KashyapiNagaHarshitha committed on
Commit
b10249c
1 Parent(s): 5a89c15

Upload 3 files

Browse files
Files changed (3)
  1. Quality_Control.py +1796 -0
  2. my_modules.py +468 -0
  3. stored_variables.json +6 -0
Quality_Control.py ADDED
@@ -0,0 +1,1796 @@
+ #!/usr/bin/env python
+ # coding: utf-8
+
+ import warnings
+ import os
+ import plotly
+ import seaborn as sb
+ import plotly.express as px
+ import panel as pn
+ import holoviews as hv
+ import hvplot.pandas
+ import pandas as pd
+ import numpy as np
+ import json
+ import matplotlib.pyplot as plt
+ from bokeh.plotting import figure
+ from bokeh.io import push_notebook, show
+ from bokeh.io.export import export_png
+ from bokeh.resources import INLINE
+ from bokeh.embed import file_html
+ from bokeh.io import curdoc
+ from bokeh.models import Span, Label
+ from bokeh.models import ColumnDataSource, Button
+ from my_modules import *
+
+ # Silence FutureWarnings & UserWarnings
+ warnings.filterwarnings('ignore', category=FutureWarning)
+ warnings.filterwarnings('ignore', category=UserWarning)
+
+
+ '''get_ipython().run_line_magic('store', '-r base_dir')
+ get_ipython().run_line_magic('store', '-r set_path')
+ get_ipython().run_line_magic('store', '-r ls_samples')
+ get_ipython().run_line_magic('store', '-r selected_metadata_files')'''
+
+
+ '''# Retrieve the variables from the JSON file
+ with open('stored_variables.json', 'r') as file:
+     stored_vars = json.load(file)
+
+ base_dir = stored_vars['base_dir']
+ set_path = stored_vars['set_path']
+ selected_metadata_files = stored_vars['selected_metadata_files']
+ ls_samples = stored_vars['ls_samples']
+ print(f"Base Directory: {base_dir}")
+ print(f"Set Path: {set_path}")
+ print(f"Selected_metadata_files: {selected_metadata_files}")
+
+
+ print(base_dir)
+ print(set_path)
+ print(ls_samples)
+ print(selected_metadata_files)'''
+
+
+ # Default values; ls_samples and selected_metadata_files are reloaded from
+ # stored_variables.json further below. These must be real lists, not strings.
+ base_dir = '/Users/harshithakolipaka/Downloads/wetransfer_data-zip_2024-05-17_1431'
+ set_path = 'test'
+ selected_metadata_files = ['Slide_B_DD1s1.one_1.tif.csv', 'Slide_B_DD1s1.one_2.tif.csv']
+ ls_samples = ['Ashlar_Exposure_Time.csv', 'new_data.csv', 'DD3S1.csv', 'DD3S2.csv', 'DD3S3.csv', 'TMA.csv']
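+
+ # For reference, stored_variables.json is expected to hold these four keys
+ # (an illustrative shape; actual values come from the earlier setup step):
+ # {
+ #     "base_dir": "/path/to/data",
+ #     "set_path": "test",
+ #     "selected_metadata_files": ["Slide_B_DD1s1.one_1.tif.csv"],
+ #     "ls_samples": ["DD3S1.csv", "DD3S2.csv"]
+ # }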
+
+ pn.extension()
+
+ update_button = pn.widgets.Button(name='CSV Files', button_type='primary')
+ def update_samples(event):
+     with open('/Users/harshithakolipaka/Desktop/CycIF_platform_py/stored_variables.json', 'r') as file:
+         stored_vars = json.load(file)
+     ls_samples = stored_vars['ls_samples']
+     print(ls_samples)
+ update_button.on_click(update_samples)
+
+ csv_files_button = pn.widgets.Button(icon="clipboard", name=" Click on the clipboard to display the selected files", button_type="primary")
+ indicator = pn.indicators.LoadingSpinner(value=False, size=25)
+
+ def handle_click(clicks):
+     with open('/Users/harshithakolipaka/Desktop/CycIF_platform_py/stored_variables.json', 'r') as file:
+         stored_vars = json.load(file)
+     ls_samples = stored_vars['ls_samples']
+     return f'CSV Files Selected: {ls_samples}'
+
+ pn.Row(
+     csv_files_button,
+     pn.bind(handle_click, csv_files_button.param.clicks),
+ )
+
+
+ # ## I.2. DIRECTORIES
+
+ set_path = 'test'
+
+ # Set base directory
+
+ directorio_actual = os.getcwd()
+ print(directorio_actual)
+
+ ##### MAC WORKSTATION #####
+ #base_dir = r'/Volumes/LaboLabrie/Projets/OC_TMA_Pejovic/Temp/Zoe/CyCIF_pipeline/'
+ ###########################
+
+ ##### WINDOWS WORKSTATION #####
+ #base_dir = r'C:\Users\LaboLabrie\gerz2701\cyCIF-pipeline\Set_B'
+ ###############################
+ input_path = base_dir
+
+ ##### LOCAL WORKSTATION #####
+ #base_dir = r'/Users/harshithakolipaka/Downloads/wetransfer_data-zip_2024-05-17_1431/'
+ base_dir = input_path
+ print(base_dir)
+ #############################
+
+ #set_name = 'Set_A'
+ #set_name = 'test'
+ set_name = set_path
+
+ project_name = set_name            # Project name
+ step_suffix = 'qc_eda'             # Current part (here part I)
+ previous_step_suffix_long = ""     # Previous part (here empty)
+
+ # Initial input data directory
+ input_data_dir = os.path.join(base_dir, project_name + "_data")
+
+ # QC/EDA output directories
+ # global output
+ output_data_dir = os.path.join(base_dir, project_name + "_" + step_suffix)
+ # images subdirectory
+ output_images_dir = os.path.join(output_data_dir, "images")
+
+ # Data and Metadata directories
+ # global data
+ metadata_dir = os.path.join(base_dir, project_name + "_metadata")
+ # images subdirectory
+ metadata_images_dir = os.path.join(metadata_dir, "images")
+
+ # Create directories if they don't already exist
+ for d in [base_dir, input_data_dir, output_data_dir, output_images_dir, metadata_dir, metadata_images_dir]:
+     if not os.path.exists(d):
+         print("Creating the", d, "directory...")
+         os.makedirs(d)
+     else:
+         print("The", d, "directory already exists!")
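+
+ # A minimal equivalent sketch: os.makedirs with exist_ok=True creates any
+ # missing parent directories and silently skips ones that already exist,
+ # so the existence check above could be collapsed to:
+ # for d in [base_dir, input_data_dir, output_data_dir, output_images_dir, metadata_dir, metadata_images_dir]:
+ #     os.makedirs(d, exist_ok=True)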
+
+ os.chdir(input_data_dir)
+ with open('/Users/harshithakolipaka/Desktop/CycIF_platform_py/stored_variables.json', 'r') as file:
+     stored_vars = json.load(file)
+     ls_samples = stored_vars['ls_samples']
+     selected_metadata_files = stored_vars['selected_metadata_files']
+
+ directories = []
+ for i in [base_dir, input_data_dir, output_data_dir, output_images_dir, metadata_dir, metadata_images_dir]:
+     directories.append(i)
+
+ directories
+
+ def print_directories(directories):
+     label_path = []
+     labels = [
+         "base_dir",
+         "input_data_dir",
+         "output_data_dir",
+         "output_images_dir",
+         "metadata_dir",
+         "metadata_images_dir"
+     ]
+
+     for label, path in zip(labels, directories):
+         label_path.append(f"{label} : {path}")
+
+     return label_path
+
+ print("\n".join(print_directories(directories)))
+
+
+ # Verify paths
+ print('base_dir :', base_dir)
+ print('input_data_dir :', input_data_dir)
+ print('output_data_dir :', output_data_dir)
+ print('output_images_dir :', output_images_dir)
+ print('metadata_dir :', metadata_dir)
+ print('metadata_images_dir :', metadata_images_dir)
+
+
+ # ## I.3. FILES
+
+ # List all the .csv files in the metadata/data directory
+ # Don't forget to move the csv files into the proj_data directory;
+ # if the data dir is empty, this will not work
+ #ls_samples = [sample for sample in os.listdir(input_data_dir) if sample.endswith(".csv")]
+ print("The following CSV files were detected:\n\n", ls_samples, "\n\nin", input_data_dir, "directory.")
+
+
+ # In[26]:
+
+
+ def combine_and_save_metadata_files(metadata_dir, selected_metadata_files):
+     if len(selected_metadata_files) == 0:
+         warnings.warn("No Ashlar file uploaded. Please upload a valid file.", UserWarning)
+         print("No metadata files selected.")
+         return pd.DataFrame()
+
+     elif len(selected_metadata_files) > 1:
+         combined_metadata_df = pd.DataFrame()
+
+         for file in selected_metadata_files:
+             file_path = os.path.join(metadata_dir, file)
+             df = pd.read_csv(file_path)
+             combined_metadata_df = pd.concat([combined_metadata_df, df], ignore_index=True)
+
+         combined_metadata_df.to_csv(os.path.join(metadata_dir, "combined_metadata.csv"), index=False)
+         print(f"Combined metadata file saved as 'combined_metadata.csv' in {metadata_dir}")
+
+         return combined_metadata_df
+
+     else:
+         single_file_path = os.path.join(metadata_dir, selected_metadata_files[0])
+         single_file_df = pd.read_csv(single_file_path)
+         print(f"Only one file selected: {selected_metadata_files[0]}")
+         return single_file_df
+
+
+ # In[27]:
+
+
+ print(combine_and_save_metadata_files(metadata_dir, selected_metadata_files))
+
+
+ # In[28]:
+
+
+ ls_samples
+
+
+ # In[29]:
+
+
+ df = pd.read_csv(os.path.join(input_data_dir, ls_samples[0]), index_col=0, nrows=1)
+ df.head(10)
+
+
+ # In[30]:
+
+
+ # First gather information on expected headers using the first file in ls_samples
+ # Read in the first row of the file corresponding to the first sample (index = 0) in ls_samples
+ df = pd.read_csv(os.path.join(input_data_dir, ls_samples[0]), index_col=0, nrows=1)
+
+ # Make sure the file was imported correctly
+ print("df :\n", df.head(), "\n")
+ print("df's columns :\n", df.columns, "\n")
+ print("df's index :\n", df.index, "\n")
+ print("df's index name :\n", df.index.name)
+
+
+ # In[31]:
+
+
+ df.head()
+
+
+ # In[32]:
+
+
+ # Verify that the ID column in the input file became the index
+ # Verify that the index name is "ID"; if not, rename it
+ if df.index.name != "ID":
+     print("Expected the first column in input file (index_col = 0) to be 'ID'. \n"
+           "This column will be used to set the index names (cell number for each sample). \n"
+           "It appears that the column '" + df.index.name + "' was actually imported as the index column.")
+     #df.index.name = 'ID'
+     print("A new index name (first column) will be given ('ID') to replace the current one '" + df.index.name + "'\n")
+
+ # Apply the changes to the headers as specified with the apply_header_changes() function (in my_modules.py)
+ # Apply the changes to the dataframe rows as specified with the apply_df_changes() function (in my_modules.py)
+ #df = apply_header_changes(df)
+ print(df.index)
+ df.index = df.index.str.replace(r'@1$', '', regex=True)
+ df = apply_df_changes(df)
+
+ # Set variable to hold default header values
+ expected_headers = df.columns.values
+ expected_header = True
+ print(expected_header)
+
+ initial_dataframe = df
+ # Make sure the file is now formatted correctly
+ print("\ndf :\n", df.head(), "\n")
+ print("df's columns :\n", df.columns, "\n")
+ print("df's index :\n", df.index, "\n")
+ print("df's index name :\n", df.index.name)
+
+
+ # In[33]:
+
+
+ df.head()
+
+
+ # In[34]:
+
+
+ df.head()
+
+
+ # In[35]:
+
+
+ print("Used " + ls_samples[0] + " to determine the expected and corrected headers for all files.\n")
+ print("These headers are: \n" + ", ".join(expected_headers))
+
+ corrected_headers = True
+
+
+ # In[36]:
+
+
+ for sample in ls_samples:
+     file_path = os.path.join(input_data_dir, sample)
+     print(file_path)
+
+
+ # In[37]:
+
+
+ # Import all the other files
+ dfs = {}
+ ###############################
+ # !! This may take a while !! #
+ ###############################
+ errors = []
+
+ # Iterate over a copy of the list so that removing a sample inside the loop is safe
+ for sample in list(ls_samples):
+     file_path = os.path.join(input_data_dir, sample)
+
+     try:
+         # Read the CSV file
+         df = pd.read_csv(file_path, index_col=0)
+         # Check if the DataFrame is empty; if so, don't continue processing it and remove it
+
+         if not df.empty:
+             # Manipulations necessary for concatenation
+             df = apply_header_changes(df)
+             df = apply_df_changes(df)
+             # Reorder the columns to match the expected headers list
+             #df = df.reindex(columns=expected_headers)
+             print(df.head(1))
+             print(sample, "file is processed!\n")
+             #print(df)
+
+             # Compare df's headers against what is expected
+             compare_headers(expected_headers, df.columns.values, sample)
+             #print(df.columns.values)
+         # Add a new column to identify the csv file (sample) the df comes from
+         df['Sample_ID'] = sample
+
+     except pd.errors.EmptyDataError:
+         errors.append(f'\nEmpty data error in {sample} file. Removing from analysis...')
+         print(f'\nEmpty data error in {sample} file. Removing from analysis...')
+         ls_samples.remove(sample)
+         continue  # skip adding this sample's (nonexistent) df to dfs
+
+     # Add df to dfs
+     dfs[sample] = df
+
+ print(dfs)
+
+
+ dfs.values()
+
+ # Merge dfs into one df
+ df = pd.concat(dfs.values(), ignore_index=False, sort=False)
+ del dfs
+ merge = True
+ merged_dataframe = df
+ df.head()
+
+ # Set index to Sample_ID + cell number:
+ # create a new custom index for df based on the sample names and integer cell numbers, then remove the temporary columns 'level_0' and 'index' that were introduced during the operations
+
+ # Create a copy of the DataFrame df and reset its index without creating a new column for the old index
+ # This essentially removes the old index column and replaces it with a default integer index
+ df = df.copy().reset_index(drop=True)
+
+ #print(df)
+
+ # Initialize an empty list to store the new index labels for the DataFrame
+ index = []
+
+ for sample in ls_samples:
+     # Extract a chunk of data from the original df where the 'Sample_ID' column matches the current sample name
+     # This chunk is stored in the df_chunk df, which is a subset of the original data for that specific sample
+     df_chunk = df.loc[df['Sample_ID'] == sample, :].copy()
+     old_index = df_chunk.index
+     # Reset the index of the df_chunk df, removing the old index and replacing it with a default integer index
+     df_chunk = df_chunk.reset_index(drop=True)
+     # A new index is created for the df_chunk df. It combines the sample name with 'Cell_' and the integer index values, converting them to strings
+     # This new index will have labels like 'SampleName_Cell_0', 'SampleName_Cell_1', and so on.
+     sample = sample.split('.')[0]
+     df_chunk = df_chunk.set_index(f'{sample}_Cell_' + df_chunk.index.astype(str))
+     # The index values of df_chunk are then added to the index list
+     index = index + df_chunk.index.values.tolist()
+
+ # After processing all the samples in the loop, assign the index list as the new index of the original df.
+ df.index = index
+ # Remove the 'level_0' and 'index' columns from df
+ df = df.loc[:, ~df.columns.isin(['level_0', 'index'])]
+ assigned_new_index = True
+ df.head()
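+
+ # For example, with ls_samples containing 'DD3S1.csv', the rows of the DD3S1
+ # chunk now carry index labels 'DD3S1_Cell_0', 'DD3S1_Cell_1', ... (the sample
+ # name without its extension, then 'Cell_', then the row number).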
+
+
+ # ### I.3.2. NOT_INTENSITIES
+
+ # not_intensities is the list of the columns unrelated to the markers' fluorescence intensities
+ # It can include items that aren't in a given header.
+ #not_intensities = ['Nuc_X', 'Nuc_X_Inv', 'Nuc_Y', 'Nuc_Y_Inv', 'Nucleus_Roundness', 'Nucleus_Size', 'Cell_Size',
+ #                   'ROI_index', 'Sample_ID', 'replicate_ID', 'Cell_ID', 'cell_type', 'cell_subtype', 'cluster', 'ID',
+ #                   'Cytoplasm_Size', 'immune_checkpoint', 'Unique_ROI_index', 'Patient', 'Primary_chem(1)_vs_surg(0)']
+
+ # Get all column names
+ all_columns = df.columns.tolist()
+
+ # Create a list to store non-intensity column names
+ not_intensities = []
+ intensity_columns = []
+ # Iterate over each column name
+ for column in all_columns:
+     # Check if the column name contains 'Intensity_Average'
+     if 'Intensity_Average' not in column:
+         not_intensities.append(column)
+     else:
+         intensity_columns.append(column)
+
+
+ # Create a new DataFrame with non-intensity columns
+ not_intensities_df = pd.DataFrame(not_intensities)
+ print("Non-intensity columns:")
+ print(not_intensities)
+
+ print("Non-intensity DataFrame:")
+ not_intensities_df
+ #print(len(intensity_columns))
+
+
+ pd.DataFrame(not_intensities)
+
+ path_not_intensities = os.path.join(metadata_dir, "not_intensities.csv")
+
+ # If this file already exists, add only the not_intensities items not already present in the file
+ if os.path.exists(path_not_intensities):
+     print("'not_intensities.csv' already exists.")
+     print("Reconciling file and Jupyter notebook lists.")
+     file_not_intensities = open(path_not_intensities, "r")
+     file_ni = file_not_intensities.read().splitlines()
+     # Set difference to identify items not already in file
+     to_add = set(not_intensities) - set(file_ni)
+     # We want not_intensities to be a complete list
+     not_intensities = list(set(file_ni) | set(not_intensities))
+     file_not_intensities.close()
+     file_not_intensities = open(path_not_intensities, "a")
+     for item in to_add:
+         file_not_intensities.write(item + "\n")
+     file_not_intensities.close()
+
+ else:
+     # The file does not yet exist
+     print("Could not find " + path_not_intensities + ". Creating now.")
+     file_not_intensities = open(path_not_intensities, "w")
+     for item in not_intensities:
+         file_not_intensities.write(item + "\n")
+     file_not_intensities.close()
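+
+ # Behavior sketch: the reconciliation above is an append-only set union. If the
+ # file holds {'ID', 'Sample_ID'} and the notebook list holds {'Sample_ID', 'Patient'},
+ # only 'Patient' is appended to the file, and not_intensities becomes the union.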
+
+
+ # In[46]:
+
+
+ not_intensities_df = pd.read_csv(path_not_intensities)
+ not_intensities_df
+
+
+ # In[47]:
+
+
+ # Columns we want to keep: not_intensities, and any intensity column that contains 'Intensity_Average' (drop any intensity marker column that is not a mean intensity)
+ to_keep = not_intensities + [x for x in df.columns.values[~df.columns.isin(not_intensities)] if 'Intensity_Average' in x]
+
+ to_keep
+
+
+ # In[48]:
+
+
+ print(len(to_keep) - 1)
+
+
+ # In[49]:
+
+
+ # However, our to_keep list contains items that might not be in our df headers!
+ # These items come from our not_intensities list. So let's ask for only those items from to_keep that are actually found in our df
+ # Retain only the columns from the to_keep list that are found in the df's headers (columns).
+ # This ensures that we are only keeping the columns that exist in the df, avoiding any potential issues with non-existent column names.
+ # The result is a df containing only the specified columns.
+ df = df[[x for x in to_keep if x in df.columns.values]]
+
+ df.head()
+
+
+ # In[50]:
+
+
+ # Assuming you have a DataFrame named 'df'
+ # df = pd.read_csv('your_file.csv')
+
+ # Get all column names
+ all_columns = df.columns.tolist()
+
+ # Create an empty list to store intensity markers
+ intensity_marker = []
+
+ # Iterate over each column name
+ for column in all_columns:
+     # Check if the column name contains 'Intensity_Average'
+     if 'Intensity_Average' in column:
+         # Split the column name by underscore
+         parts = column.split('_')
+
+         # Extract the word before the first underscore
+         marker = parts[0]
+
+         # Add the marker to the intensity_marker list
+         intensity_marker.append(marker)
+
+ # Remove duplicates from the intensity_marker list
+ intensity_marker = list(set(intensity_marker))
+
+ print("Intensity Markers:")
+ print(intensity_marker)
+
+ # Create a callback function to update the intensities array
+ def update_intensities(event):
+     global intensities
+     global intensities_df
+     new_intensities = []
+     selected_columns = []
+     for marker, cell, cytoplasm, nucleus in zip(marker_options_df['Marker'], marker_options_df['Cell'], marker_options_df['Cytoplasm'], marker_options_df['Nucleus']):
+         if cell:
+             new_intensities.append(f"{marker}_Cell_Intensity_Average")
+             selected_columns.append(f"{marker}_Cell_Intensity_Average")
+         if cytoplasm:
+             new_intensities.append(f"{marker}_Cytoplasm_Intensity_Average")
+             selected_columns.append(f"{marker}_Cytoplasm_Intensity_Average")
+         if nucleus:
+             new_intensities.append(f"{marker}_Nucleus_Intensity_Average")
+             selected_columns.append(f"{marker}_Nucleus_Intensity_Average")
+     intensities = new_intensities
+     if selected_columns:
+         intensities_df = merged_dataframe[selected_columns]
+     else:
+         intensities_df = pd.DataFrame()
+     print("Updated intensities DataFrame:")
+     print(intensities_df)
+
+
+ # In[54]:
+
+
+ tabulator_formatters = {
+     'bool': {'type': 'tickCross'}
+ }
+
+ # Create a DataFrame with the intensity markers and default values
+ marker_options_df = pd.DataFrame({
+     'Marker': intensity_marker,
+     'Cell': [False] * len(intensity_marker),
+     'Cytoplasm': [False] * len(intensity_marker),
+     'Nucleus': [False] * len(intensity_marker)
+ })
+
+ # Create the Tabulator widget and link the callback function
+ tabulator = pn.widgets.Tabulator(marker_options_df, formatters=tabulator_formatters, sizing_mode='stretch_width')
+ tabulator.param.watch(update_intensities, 'value')
+
+ # Create a Panel layout with the Tabulator widget
+ marker_options_layout = pn.Column(tabulator, sizing_mode="stretch_width")
+
+ # Initialize the Panel extension with Tabulator
+ pn.extension('tabulator')
+
+ # Create a DataFrame with the intensity markers and default values
+ marker_options_df = pd.DataFrame({
+     'Marker': intensity_marker,
+     'Cell': [True] * len(intensity_marker),
+     'Cytoplasm': [False] * len(intensity_marker),
+     'Nucleus': [False] * len(intensity_marker)
+ })
+
+ # Define formatters for the Tabulator widget
+ tabulator_formatters = {
+     'Cell': {'type': 'tickCross'},
+     'Cytoplasm': {'type': 'tickCross'},
+     'Nucleus': {'type': 'tickCross'}
+ }
+
+ # Create the Tabulator widget
+ tabulator = pn.widgets.Tabulator(marker_options_df, formatters=tabulator_formatters, sizing_mode='stretch_width')
+
+ # Create a DataFrame to store the initial intensities
+ new_data = [{'Description': f"{marker}_Cell_Intensity_Average"} for marker in intensity_marker]
+ new_data_df = pd.DataFrame(new_data)
+
+ # Create a widget to display the new data as a DataFrame
+ new_data_table = pn.widgets.Tabulator(new_data_df, name='New Data Table', sizing_mode='stretch_width')
+
+ # Create a button to start the update process
+ run_button = pn.widgets.Button(name="Save Selection", button_type='primary')
+
+ # Define the update_intensities function (this zero-argument version replaces the
+ # event-handler version above for the button-driven workflow)
+ def update_intensities():
+     global new_data, new_data_df
+     new_data = []
+     for _, row in tabulator.value.iterrows():
+         marker = row['Marker']
+         if row['Cell']:
+             new_data.append({'Description': f"{marker}_Cell_Intensity_Average"})
+         if row['Cytoplasm']:
+             new_data.append({'Description': f"{marker}_Cytoplasm_Intensity_Average"})
+         if row['Nucleus']:
+             new_data.append({'Description': f"{marker}_Nucleus_Intensity_Average"})
+     new_data_df = pd.DataFrame(new_data)
+     new_data_table.value = new_data_df
+
+ # Define the runner function
+ async def runner(event):
+     update_intensities()
+
+ # Bind the runner function to the button
+ run_button.on_click(runner)
+
+ # Layout
+ updated_intensities = pn.Column(tabulator, run_button, new_data_table, sizing_mode="stretch_width")
+
+ pn.extension()
+ # Serve the layout
+ #updated_intensities.servable()
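+
+ # To actually serve this layout, one option (sketch) is to uncomment the
+ # .servable() call above and run `panel serve Quality_Control.py` from a
+ # terminal, or call pn.serve(updated_intensities) from a Python session.
+ # Edits made in the table are read back via tabulator.value.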
+
+
+ intensities_df = new_data_table
+ intensities_df
+
+ intensities_df = pn.pane.DataFrame(intensities_df)
+ intensities_df
+
+ print(intensities_df)
+ # ## I.4. QC CHECKS
+
+ def quality_check_results(check_index, check_shape, check_no_null, check_zero_intensities):
+     results = [
+         f"Check Index: {check_index}",
+         f"Check Shape: {check_shape}",
+         f"Check No Null: {check_no_null}",
+         f"Check Zero Intensities: {check_zero_intensities}"
+     ]
+     return pn.Column(*[pn.Row(result) for result in results], sizing_mode="stretch_width")
+
+ print(ls_samples)
+
+ def check_index_format(index_str, ls_samples):
+     """
+     Checks if the given index string follows the specified format.
+
+     Args:
+         index_str (str): The index string to be checked.
+         ls_samples (list): A list of valid sample names.
+
+     Returns:
+         bool: True if the index string follows the format, False otherwise.
+     """
+     # Split the index string into parts
+     parts = index_str.split('_')
+
+     # Check if there are exactly 3 parts
+     if len(parts) != 3:
+         print(len(parts))
+         return False
+
+     # Check if the first part is in ls_samples
+     sample_name = parts[0]
+     if f'{sample_name}.csv' not in ls_samples:
+         print(sample_name)
+         return False
+
+     # Check if the second part is in ['Cell', 'Cytoplasm', 'Nucleus']
+     location = parts[1]
+     valid_locations = ['Cell', 'Cytoplasm', 'Nucleus']
+     if location not in valid_locations:
+         print(location)
+         return False
+
+     # Check if the third part is a number
+     try:
+         index = int(parts[2])
+     except ValueError:
+         print(parts[2])
+         return False
+
+     # If all checks pass, return True
+     return True
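+
+ # Usage sketch (illustrative sample names): with ls_samples containing 'DD3S1.csv',
+ # check_index_format('DD3S1_Cell_0', ls_samples)     -> True
+ # check_index_format('DD3S1_Membrane_0', ls_samples) -> False (invalid location part)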
+
+
+ # In[70]:
+
+
+ # Let's take a look at a few features to make sure our dataframe is as expected
+ df.index
+ def check_format_ofindex(index):
+     for idx in index:
+         check_index = check_index_format(idx, ls_samples)
+         if check_index is False:
+             index_format = "Bad"
+             return index_format
+
+     index_format = "Good"
+     return index_format
+ print(check_format_ofindex(df.index))
+
+
+ # In[71]:
+
+
+ df.shape
+ check_index = df.index
+ check_shape = df.shape
+ print(check_shape)
+
+
+ # In[72]:
+
+
+ # Check for NaN entries (should not be any unless columns do not align)
+ # False means no NaN entries
+ # True means NaN entries
+ df.isnull().any().any()
+
+ check_no_null = df.isnull().any().any()
+
+
+ # In[73]:
+
+
+ # Check that all expected files were imported into the final dataframe
+ if sorted(df.Sample_ID.unique()) == sorted(ls_samples):
+     print("All expected filenames are present in big df Sample_ID column.")
+     check_all_expected_files_present = "All expected filenames are present in big df Sample_ID column."
+ else:
+     compare_headers(['no samples'], df.Sample_ID.unique(), "big df Sample_ID column")
+     check_all_expected_files_present = compare_headers(['no samples'], df.Sample_ID.unique(), "big df Sample_ID column")
+
+ print(df.Sample_ID)
+
+
+ # In[74]:
+
+
+ # Delete rows that have 0-value mean intensities for the intensity columns
+ print("df.shape before removing 0 mean values: ", df.shape)
+
+ # Compute the row-wise mean of the intensity columns only, i.e. all columns
+ # not listed in not_intensities (which are excluded from the calculation).
+ ###############################
+ # !! This may take a while !! #
+ ###############################
+ # Calculate mean intensity excluding 'not_intensities' columns
+ mean_intensity = df.loc[:, ~df.columns.isin(not_intensities)].mean(axis=1)
+
+ # Check if there are any 0 mean intensity values
+ if (mean_intensity == 0).any():
+     df = df.loc[mean_intensity > 0, :]
+     print("Shape after removing 0 mean values: ", df.shape)
+     check_zero_intensities = f'df.shape after removing 0 mean values: {df.shape}'
+ else:
+     print("No zero intensity values.")
+     check_zero_intensities = "No zero intensity values found in the DataFrame."
+
+
+ # Get quantiles (5th, 50th, 95th)
+ # List of nucleus size percentiles to extract
+ #qs = [0.05,0.50,0.95]
+
+ #df["Nucleus_Size"].quantile(q=qs)
+
+ quality_control_df = df
+ quality_control_df.head()
+
+ # Function to perform quality checks
+ def perform_quality_checks(df, ls_samples, not_intensities):
+     results = {}
+     errors = []
+     # Check index
+     results['index'] = df.index
+
+     # Check shape
+     results['shape'] = df.shape
+
+     # Check for NaN entries
+     results['nan_entries'] = df.isnull().any().any()
+
+     # Remove rows with 0 mean intensity values
+     mean_intensity = df.loc[:, ~df.columns.isin(not_intensities)].mean(axis=1)
+     if (mean_intensity == 0).any():
+         df = df.loc[mean_intensity > 0, :]
+         results['zero_intensity_removal'] = f"Zero intensity entries were found and removed. Shape after removing: {df.shape}"
+     else:
+         results['zero_intensity_removal'] = "No zero intensity values found in the DataFrame."
+
+     return results
+
+ # Example usage of the function
+ # (note: this rebinds the name quality_check_results from the Panel helper above to a dict)
+ quality_check_results = perform_quality_checks(df, ls_samples, not_intensities)
+
+ # Print results
+ for key, value in quality_check_results.items():
+     print(f"{key}: {value}")
+
+
+ # In[80]:
+
+
+ def quality_check(file, not_intensities):
+     # Load the output file
+     df = file
+
+     # Check Index
+     check_index = check_format_ofindex(df.index)
+
+     # Check Shape
+     check_shape = df.shape
+
+     # Check for NaN entries
+     check_no_null = df.isnull().any().any()
+
+     mean_intensity = df.loc[:, ~df.columns.isin(not_intensities)].mean(axis=1)
+     if (mean_intensity == 0).any():
+         df = df.loc[mean_intensity > 0, :]
+         print("df.shape after removing 0 mean values: ", df.shape)
+         check_zero_intensities = f'df.shape after removing 0 mean values: {df.shape}'
+     else:
+         print("No zero intensity values found in the DataFrame.")
+         check_zero_intensities = "No zero intensities."
+
+     # Create a quality check results table
+     quality_check_results_table = pd.DataFrame({
+         'Check': ['Index', 'Shape', 'Check for NaN Entries', 'Check for Zero Intensities'],
+         'Result': [str(check_index), str(check_shape), str(check_no_null), check_zero_intensities]
+     })
+
+     # Create a quality check results component
+     quality_check_results_component = pn.Card(
+         pn.pane.DataFrame(quality_check_results_table),
+         title="Quality Control Results",
+         header_background="#2196f3",
+         header_color="white",
+     )
+
+     return quality_check_results_component
+
+ quantile_slider = pn.widgets.FloatSlider(name='Quantile', start=0.01, end=0.99, step=0.01, value=0.05)
+
+
+ # Function to calculate quantile values
+ def calculate_quantiles(quantile):
+     quantile_value_intensity = df["AF555_Cell_Intensity_Average"].quantile(q=[quantile, 0.50, 1 - quantile])
+     return quantile_value_intensity
+
+ # Function to create the Panel app
+ def create_app(quantile=quantile_slider.param.value):
+     quantiles = calculate_quantiles(quantile)
+     output = pd.DataFrame(quantiles)
+
+     # Create a DataFrame pane to display the output
+     output_widget = pn.pane.DataFrame(output)
+
+     return output_widget
+
+
+ # Bind the create_app function to the quantile slider
+ quantile_output_app = pn.bind(create_app, quantile_slider.param.value)
+ #pn.Column(quantile_slider, quantile_output_app).servable()
+
+ # Function to create the line graph plot using Bokeh
+ def create_line_graph2(quantile):
+     # Calculate histogram
+     hist, edges = np.histogram(df['Nucleus_Size'], bins=30)
+
+     # Calculate the midpoints of bins for plotting
+     midpoints = (edges[:-1] + edges[1:]) / 2
+
+     # Calculate quantiles
+     qs = [quantile, 0.50, 1.00 - quantile]
+     quantiles = df['Nucleus_Size'].quantile(q=qs).values
+
+     # Create Bokeh line graph plot
+     p = figure(title='Frequency vs. Nucleus_Size',
+                x_axis_label='Nucleus_Size',
+                y_axis_label='Frequency',
+                width=800, height=400)
+
+     # Plot histogram
+     p.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:],
+            fill_color='skyblue', line_color='black', alpha=0.6)
+
+     # Plot line graph
+     p.line(midpoints, hist, line_width=2, color='blue', alpha=0.7)
+
+     # Add quantile lines
+     for q in quantiles:
+         span = Span(location=q, dimension='height', line_color='red', line_dash='dashed', line_width=2)
+         p.add_layout(span)
+         p.add_layout(Label(x=q, y=max(hist), text=f'{q:.1f}', text_color='red'))
+
+     return p
+
+ # Bind the create_line_graph function to the quantile slider
+ nucleus_size_line_graph_with_histogram = pn.bind(create_line_graph2, quantile=quantile_slider.param.value)
+
+ # Layout the components in a Panel app
+ #nucleus_size_line_graph_with_histogram = pn.Column(create_line_graph2(quantile = quantile_slider.param.value))
+ #nucleus_size_line_graph_with_histogram.servable()
+ # Layout the components in a Panel app
+ plot1 = pn.Column(quantile_slider, pn.pane.Bokeh(nucleus_size_line_graph_with_histogram))
+ #plot1.servable()
+
+ # Removing cells based on nucleus size
+
+ quantile = quantile_slider.value
+ qs = [quantile, 0.50, 1.00 - quantile]
+ quantiles = df['Nucleus_Size'].quantile(q=qs).values
+ threshold = quantiles[2]
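+
+ # Worked example: with the slider at its default quantile = 0.05,
+ # qs = [0.05, 0.50, 0.95], so threshold = quantiles[2] is the 95th percentile
+ # of Nucleus_Size; cells above it are dropped below.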
+
+
+ # In[89]:
+
+
+ print(threshold)
+
+
+ # In[90]:
+
+
+ # Define the quantile slider
+ #quantile_slider = pn.widgets.FloatSlider(name='Quantile', start=0.01, end=0.99, step=0.01, value=0.05)
+
+ # Function to update the threshold and display the number of cells removed
+ def update_threshold_and_display(quantile):
+     qs = [quantile, 0.50, 1.00 - quantile]
+     quantiles = df['Nucleus_Size'].quantile(q=qs).values
+     threshold = quantiles[2]
+
+     # Filter the DataFrame based on the new threshold
+     df_filtered = df.loc[(df['Nucleus_Size'] > 42) & (df['Nucleus_Size'] < threshold)]
+
+     # Calculate the number of cells removed
+     cells_before_filter = df.shape[0]
+     cells_after_filter = df_filtered.shape[0]
+     cells_removed = cells_before_filter - cells_after_filter
+
+     # Display the results
+     results = pn.Column(
+         f"Number of cells before filtering: {cells_before_filter}",
+         f"Number of cells after filtering on nucleus size: {cells_after_filter}",
+         f"Number of cells removed: {cells_removed}"
+     )
+
+     return results
+
+ # Bind the update function to the quantile slider
+ results_display = pn.bind(update_threshold_and_display, quantile_slider)
+
+ # Layout the components in a Panel app
+ layout2 = results_display
+
+
+ # In[91]:
+
+
+ print("Number of cells before filtering:", df.shape[0])
+ cells_before_filter = f"Number of cells before filtering: {df.shape[0]}"
+ # Delete small cells and objects w/ high AF555 signal (RBCs)
+ # We usually use the 95th percentile calculated during QC_EDA
+ df = df.loc[(df['Nucleus_Size'] > 42)]
+ df = df.loc[(df['Nucleus_Size'] < threshold)]
+ cells_after_filter_nucleus_shape = df.shape[0]
+ print("Number of cells after filtering on nucleus size:", df.shape[0])
+
+ df = df.loc[(df['AF555_Cell_Intensity_Average'] < 2000)]
+ print("Number of cells after filtering on AF555 intensity:", df.shape[0])
+ cells_after_filter_intensity_shape = df.shape[0]
+ cells_after_filter_nucleus = f"Number of cells after filtering on nucleus size: {cells_after_filter_nucleus_shape}"
+ cells_after_filter_intensity = f"Number of cells after filtering on AF555 intensity: {cells_after_filter_intensity_shape}"
+
+ num_of_cell_removal_intensity = cells_after_filter_intensity
+
+ print(num_of_cell_removal_intensity)
+
+ num_of_cell_removal = pn.Column(cells_before_filter, cells_after_filter_nucleus)
+
+
+ # Collect the intensity columns from df
+ intensities = df.filter(like='Intensity').columns.tolist()
+
+ # Create a ColumnDataSource from the DataFrame
+ source = ColumnDataSource(df)
+
+ # Function to calculate quantile values
+ def calculate_quantiles(column, quantile):
+     quantiles = df[column].quantile(q=[quantile, 0.50, 1 - quantile]).values
+     return quantiles
+
+ # Create the dropdown menu
+ column_dropdown = pn.widgets.Select(name='Select Column', options=intensities)
+
+ quantile_slider = pn.widgets.FloatSlider(name='Quantile', start=0.01, end=0.99, step=0.01, value=0.05)
+
+
+ # Function to create the Bokeh plot
+ def create_intensity_plot(column, quantile):
+     quantiles = calculate_quantiles(column, quantile)
+     hist, edges = np.histogram(df[column], bins=30)
+     # Calculate the midpoints of bins for plotting
+     midpoints = (edges[:-1] + edges[1:]) / 2
+
+     # Create Bokeh plot
+     p = figure(title=f'Distribution of {column} with Quantiles',
+                x_axis_label=f'{column} Values',
+                y_axis_label='Frequency',
+                width=800, height=400)
+
+     p.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:],
+            fill_color='skyblue', line_color='black', alpha=0.7)
+
+     # Plot line graph
+     p.line(midpoints, hist, line_width=2, color='blue', alpha=0.7)
+
+     # Add quantile lines
+     for q in quantiles:
+         span = Span(location=q, dimension='height', line_color='red', line_dash='dashed', line_width=2)
+         p.add_layout(span)
+         p.add_layout(Label(x=q, y=max(hist), text=f'{q:.1f}', text_color='red'))
+
+     return p
+
+
+ # Bind the create_plot function to the quantile slider, column dropdown, and button click
+ marker_intensity_with_histogram = pn.bind(create_intensity_plot, column_dropdown.param.value, quantile_slider.param.value, watch=True)
+
+ # Create the button
+ generate_plot_button = Button(label='Generate Plot', button_type='primary')
+
+ def update_plot(column, quantile):
+     plot = create_intensity_plot(column, quantile)
+     plot.renderers[0].data_source = source  # Update the data source for the renderer
+     return plot
+
+ # Display the dropdown menu, quantile slider, button, and plot
+ #plot = update_plot(column_dropdown.param.value, quantile_slider.param.value)
+
+ def generate_plot(event):
+     updated_plot = update_plot(column_dropdown.param.value, quantile_slider.param.value)
+     #pn.Column(pn.Row(column_dropdown, generate_plot_button), quantile_slider, updated_plot).servable()
+
+ generate_plot_button.on_click(generate_plot)
+ selected_marker_plot = pn.Column(pn.Row(pn.Column(column_dropdown, marker_intensity_with_histogram)))
+ #pn.Column(pn.Row(pn.Column(column_dropdown, marker_intensity_with_histogram), generate_plot_button)).servable()
+
+
+ # (Re-declared helpers from the cell above, kept from the notebook conversion)
+ # Assuming you have a DataFrame 'df' with the intensity columns
+ intensities = df.filter(like='Intensity').columns.tolist()
+
+ # Create a ColumnDataSource from the DataFrame
+ source = ColumnDataSource(df)
+
+ # Function to calculate quantile values
+ def calculate_quantiles(column, quantile):
+     quantiles = df[column].quantile(q=[quantile, 0.50, 1 - quantile])
+     return quantiles
+
+
+ # In[105]:
+
+
+ quantile_slider = pn.widgets.FloatSlider(name='Quantile', start=0.01, end=0.99, step=0.01, value=0.05)
+
+
+ # Bind the create_line_graph function to the quantile slider
+ #nucleus_size_line_graph = pn.bind(create_line_graph, quantile=quantile_slider.param.value)
+
+ # Layout the components in a Panel app
+ #nucleus_size_graph = pn.Column(nucleus_size_line_graph)
+
+
+ # In[106]:
+
+
+ #df["CKs_Cytoplasm_Intensity_Average"].quantile(q=qs)
+
+
+ # In[107]:
+
+
+ len(intensities)
+ if 'CKs_Cytoplasm_Intensity_Average' in intensities:
+     print(1)
+
+
+ # In[108]:
+
+
+ df
+
+
+ # In[109]:
+
+
+ def calculate_cytoplasm_quantiles(column, quantile):
+     # Print the columns of the DataFrame
+     print("DataFrame columns:", df.columns)
+
+     # Check if the column exists in the DataFrame
+     if column not in df.columns:
+         raise KeyError(f"Column '{column}' does not exist in the DataFrame.")
+
+     quantiles = df[column].quantile(q=[quantile, 0.50, 1 - quantile])
+     return quantiles
+
+ def create_cytoplasm_intensity_df(column, quantile):
+     quantiles = calculate_cytoplasm_quantiles(column, quantile)
+     output = pd.DataFrame(quantiles)
+     return pn.pane.DataFrame(output)
+
+ # Bind the create_app function to the quantile slider
+ cytoplasm_quantile_output_app = pn.bind(create_cytoplasm_intensity_df, column='CKs_Cytoplasm_Intensity_Average', quantile=quantile_slider.param.value)
+
+ pn.Column(quantile_slider, cytoplasm_quantile_output_app)
+
+
+ # In[110]:
+
+
+ def calculate_cytoplasm_quantiles(column, quantile):
+     quantiles = df[column].quantile(q=[quantile, 0.50, 1 - quantile])
+     return quantiles
+
+ def create_cytoplasm_intensity_df(column, quantile):
+     quantiles = calculate_cytoplasm_quantiles(column, quantile)
+     output = pd.DataFrame(quantiles)
+     # Create a DataFrame pane to display the output
+     output_widget = pn.pane.DataFrame(output)
+     return output_widget
+
+
+ # Bind the create_app function to the quantile slider
+ cytoplasm_quantile_output_app = pn.bind(create_cytoplasm_intensity_df, column='CKs_Cytoplasm_Intensity_Average', quantile=quantile_slider.param.value)
+ pn.Column(quantile_slider, cytoplasm_quantile_output_app)
+
+
+ # ## I.5. COLUMNS OF INTEREST
+
+ # In[111]:
+
+
+ # Remove columns containing "DAPI"
+ df = df[[x for x in df.columns.values if 'DAPI' not in x]]
+
+ print("Columns are now...")
+ print([c for c in df.columns.values])
+
+
+ # In[112]:
+
+
+ # Create lists of full names and shortened names to use in plotting
+ full_to_short_names, short_to_full_names = \
+     shorten_feature_names(df.columns.values[~df.columns.isin(not_intensities)])
+
+ short_to_full_names
+
+
+ # In[113]:
+
+
+ # Save this data to a metadata file
+ filename = os.path.join(metadata_dir, "full_to_short_column_names.csv")
+ fh = open(filename, "w")
+ fh.write("full_name,short_name\n")
+ for k, v in full_to_short_names.items():
+     fh.write(k + "," + v + "\n")
+
+ fh.close()
+ print("The full_to_short_column_names.csv file was created!")
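+
+ # Equivalent sketch using pandas instead of manual file handling:
+ # pd.DataFrame(list(full_to_short_names.items()), columns=['full_name', 'short_name']) \
+ #     .to_csv(filename, index=False)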
+
+
+ # In[114]:
+
+
+ # Save this data to a metadata file
+ filename = os.path.join(metadata_dir, "short_to_full_column_names.csv")
+ fh = open(filename, "w")
+ fh.write("short_name,full_name\n")
+ for k, v in short_to_full_names.items():
+     fh.write(k + "," + v + "\n")
+
+ fh.close()
+ print("The short_to_full_column_names.csv file was created!")
+
+
+ # ## I.6. EXPOSURE TIME
+
+ # In[115]:
+
+
+ # Import the ashlar analysis file
+ file_path = os.path.join(metadata_dir, 'combined_metadata.csv')
+ ashlar_analysis = pd.read_csv(file_path)
+ ashlar_analysis
+
+
+ # In[116]:
+
+
+ # Extract and rename columns
+ new_df = ashlar_analysis[['Name', 'Cycle', 'ChannelIndex', 'ExposureTime']].copy()
+ new_df.rename(columns={
+     'Name': 'Target',
+     'Cycle': 'Round',
+     'ChannelIndex': 'Channel'
+ }, inplace=True)
+
+ # Apply prefixes to the Round and Channel values
+ new_df['Round'] = 'R' + new_df['Round'].astype(str)
+ new_df['Channel'] = 'c' + new_df['Channel'].astype(str)
+
+ # Save to CSV
+ new_df.to_csv('Ashlar_Exposure_Time.csv', index=False)
+
+ # Print the new dataframe
+ print(new_df)
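+
+ # Illustrative result: a Cycle of 1 and a ChannelIndex of 2 become Round 'R1'
+ # and Channel 'c2', matching the keys used for the color palettes below.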
+
+
+ # In[117]:
+
+
+ # Here, we want to end up with a data structure that incorporates metadata on each intensity marker column used in our big dataframe in an easy-to-use format.
+ # This is going to include the full name of the intensity marker columns in the big data frame,
+ # the corresponding round and channel,
+ # the target protein (e.g., CD45),
+ # and the segmentation localization information (cell, cytoplasm, nucleus)
+
+ # We can use this data structure to assign unique colors to all channels and rounds, for example, for use in later visualizations
+ # Exposure_time file from ASHLAR analysis
+ filename = "Exposure_Time.csv"
+ filename = os.path.join(metadata_dir, filename)
+ exp_df = pd.read_csv(filename)
+
+ print(exp_df)
+
+ # Verify the file imported correctly
+ # File length
+ print("df's shape: ", exp_df.shape)
+ # Headers
+ expected_headers = ['Round', 'Target', 'Exp', 'Channel']
+ compare_headers(expected_headers, exp_df.columns.values, "Imported metadata file")
+
+ # Missingness
+ if exp_df.isnull().any().any():
+     print("\nexp_df has null value(s) in row(s):")
+     print(exp_df[exp_df.isna().any(axis=1)])
+ else:
+     print("\nNo null values detected.")
+
+
+ # In[118]:
+
+
+ if len(exp_df['Target']) > len(exp_df['Target'].unique()):
+     print("One or more non-unique Target values in exp_df. Currently not supported.")
+     exp_df = exp_df.drop_duplicates(subset='Target').reset_index(drop=True)
+
+
+ # In[119]:
+
+
+ # Sort exp_df by the values in the 'Target' column in ascending order and then retrieve the first few rows of the sorted df
+ exp_df.sort_values(by=['Target']).head()
+
+
+ # In[120]:
+
+
+ # Create lowercase version of target
+ exp_df['target_lower'] = exp_df['Target'].str.lower()
+ exp_df.head()
+
+
+ # In[121]:
+
+
+ # Create a df that contains the marker intensity columns in our df that aren't in not_intensities
+ intensities = pd.DataFrame({'full_column': df.columns.values[~df.columns.isin(not_intensities)]})
+
+ intensities
+
+
+ # In[122]:
+
+
+ # Extract the marker information from the `full_column`, which corresponds to the full column name in the big dataframe
+ # The regex below captures the leading run of word characters (letters and digits,
+ # excluding underscores), i.e. everything before the first underscore
+ intensities['marker'] = intensities['full_column'].str.extract(r'([^\W_]+)')
+ # convert to lowercase
+ intensities['marker_lower'] = intensities['marker'].str.lower()
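+
+ # Example: 'CD45_Cell_Intensity_Average' -> marker 'CD45', marker_lower 'cd45'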
+
+ intensities
+
+
+ # In[123]:
+
+
+ # Subset the intensities df to exclude any column pertaining to DAPI
+ intensities = intensities.loc[intensities['marker_lower'] != 'dapi']
+
+ intensities.head()
+
+
+ # In[124]:
+
+
+ # Merge the intensities and exp_df together to create metadata
+ metadata = pd.merge(exp_df, intensities, how='left', left_on='target_lower', right_on='marker_lower')
+ metadata = metadata.drop(columns=['marker_lower'])
+ metadata = metadata.dropna()
+
+ # Target is the capitalization from the Exposure_Time.csv
+ # target_lower is Target in lowercase
+ # marker is the extracted first component of the full column in segmentation data, with corresponding capitalization
+ metadata
+
+
+ # In[125]:
+
+
+ # Add a column to signify marker target localisation.
+ # Use a lambda to determine the segmented location of each intensity marker column and update metadata accordingly
+ # Using the add_metadata_location() function in my_modules.py
+ metadata['localisation'] = metadata.apply(
+     lambda row: add_metadata_location(row), axis=1)
+
+
+ # In[126]:
+
+
+ mlid = metadata
+
+
+ # In[127]:
+
+
+ # Save this data structure to the metadata folder
+ # We don't want to add color in here, because color is better treated the same way for round, channel, and sample
+ filename = "marker_intensity_metadata.csv"
+ filename = os.path.join(metadata_dir, filename)
+ metadata.to_csv(filename, index=False)
+ print("The marker_intensity_metadata.csv file was created!")
+
+
+ # ## I.7. COLORS WORKFLOW
+
+ # ### I.7.1. CHANNELS COLORS
+
+
+ # We want colors that are categorical, since Channel is a non-ordered category (yes, they are numbered, but arbitrarily).
+ # A categorical color palette will have dissimilar colors.
+ # Get those unique colors
+ if len(metadata.Channel.unique()) > 10:
+     print("WARNING: There are more unique channel values than there are colors to choose from. Select a different palette, e.g., continuous palette 'husl'.")
+ channel_color_values = sb.color_palette("bright", n_colors=len(metadata.Channel.unique()))
+ # chose 'bright' because it is categorical and we're unlikely to have > 10
+
+ # You can customize the colors for each channel here
+ custom_colors = {
+     'c2': 'lightgreen',
+     'c3': 'tomato',
+     'c4': 'pink',
+     'c5': 'turquoise'
+ }
+
+ # sb.palplot draws the palette; it returns None, so there is nothing useful to assign
+ sb.palplot(sb.color_palette([custom_colors.get(ch, 'blue') for ch in metadata.Channel.unique()]))
+
+ # Display those unique custom colors
+ print("Unique channels are:", metadata.Channel.unique())
+ sb.palplot(sb.color_palette(channel_color_values))
1444
+
1445
+
1446
+ # In[131]:
1447
+
1448
+
1449
+ # Function to create a palette plot with custom colors
1450
+ def create_palette_plot():
1451
+ # Get unique channels
1452
+ unique_channels = metadata.Channel.unique()
1453
+
1454
+ # Define custom colors for each channel
1455
+ custom_colors = {
1456
+ 'c2': 'lightgreen',
1457
+ 'c3': 'tomato',
1458
+ 'c4': 'pink',
1459
+ 'c5': 'turquoise'
1460
+ }
1461
+
1462
+ # Get custom colors for each channel
1463
+ colors = [custom_colors.get(ch, 'blue') for ch in unique_channels]
1464
+
1465
+ # Create a palette plot (palplot)
1466
+ palette_plot = sb.palplot(sb.color_palette(colors))
1467
+ channel_color_values = sb.color_palette("bright",n_colors = len(metadata.Channel.unique()))
1468
+ channel_color_values = sb.palplot(channel_color_values)
1469
+ return palette_plot, channel_color_values
1470
+
1471
+
1472
+ # Create the palette plot directly
1473
+ palette_plot = create_palette_plot()
1474
+
1475
+ # Define the Panel app layout
1476
+ app_palette_plot = pn.Column(
1477
+ pn.pane.Markdown("### Custom Color Palette"),
1478
+ palette_plot,
1479
+ )
1480
+
1481
+ # Function to create a palette plot with custom colors
1482
+ def create_palette_plot(custom_colors):
1483
+ # Get unique channels
1484
+ unique_channels = metadata.Channel.unique()
1485
+
1486
+ # Get custom colors for each channel
1487
+ colors = [custom_colors.get(ch, 'blue') for ch in unique_channels]
1488
+
1489
+ # Create a palette plot (palplot)
1490
+ palette_plot = sb.palplot(sb.color_palette(colors))
1491
+
1492
+ return palette_plot
1493
+
1494
+ # Define custom colors for each channel
1495
+ custom_colors = {
1496
+ 'c2': 'lightgreen',
1497
+ 'c3': 'tomato',
1498
+ 'c4': 'pink',
1499
+ 'c5': 'turquoise'
1500
+ }
1501
+
1502
+ # Display those unique custom colors
1503
+ print("Unique channels are:", metadata.Channel.unique())
1504
+ # Re-create the palette plot with the custom colors (palplot displays it inline)
1505
+ app_palette_plot = create_palette_plot(custom_colors)
1506
+
1507
+
1508
+ #app_palette_plot.servable()
1509
+
1510
+
1511
+ # In[133]:
1512
+
1513
+
1514
+ # Store in a dictionary
1515
+ channel_color_dict = dict(zip(metadata.Channel.unique(), channel_color_values))
1516
+ channel_color_dict
1517
+ for k,v in channel_color_dict.items():
1518
+ channel_color_dict[k] = tuple(np.float64(c) for c in v)  # keep each color as an (R, G, B) tuple of float64
1519
+
1520
+ channel_color_dict
1521
+
1522
+
1523
+ # In[134]:
1524
+
1525
+
1526
+ color_df_channel = color_dict_to_df(channel_color_dict, "Channel")
1527
+
1528
+ # Save to file in the metadata directory
1529
+ filename = "channel_color_data.csv"
1530
+ filename = os.path.join(metadata_dir, filename)
1531
+ color_df_channel.to_csv(filename, index = False)
1532
+
1533
+ color_df_channel
1534
+
1535
+
1536
+ # In[135]:
1537
+
1538
+
1539
+ # Legend of channel info only
1540
+ g = plt.figure(figsize = (1,1)).add_subplot(111)
1541
+ g.axis('off')
1542
+ handles = []
1543
+ for item in channel_color_dict.keys():
1544
+ h = g.bar(0,0, color = channel_color_dict[item],
1545
+ label = item, linewidth =0)
1546
+ handles.append(h)
1547
+ first_legend = plt.legend(handles=handles, loc='upper right', title = 'Channel')
1548
+ # bbox_to_anchor=(10,10),
1549
+ # bbox_transform=plt.gcf().transFigure)
1550
+
1551
+ filename = "Channel_legend.png"
1552
+ filename = os.path.join(metadata_images_dir, filename)
1553
+ plt.savefig(filename, bbox_inches = 'tight')
1554
+
1555
+ # ### I.7.2. ROUNDS COLORS
1556
+
1557
+
1558
+ # we want colors that are sequential, since Round is an ordered category.
1559
+ # We can still generate colors that are easy to distinguish. Also, many of the categorical palettes cap at about 10 unique colors and repeat from there.
1560
+ # We do not want any repeats!
1561
+ round_color_values = sb.cubehelix_palette(
1562
+ len(metadata.Round.unique()), start=1, rot= -0.75, dark=0.19, light=.85, reverse=True)
1563
+ # round_color_values = sb.color_palette("cubehelix",n_colors = len(metadata.Round.unique()))
1564
+ # chose 'cubehelix' because it is sequential, and round is a continuous process
1565
+ # each color value is a tuple of three values: (R, G, B)
1566
+ print(metadata.Round.unique())
1567
+
1568
+ sb.palplot(sb.color_palette(round_color_values))
1569
+
1570
+ ## cubehelix_palette parameters: 'start' sets the starting hue, 'rot' is the number of rotations around the hue wheel (negative reverses direction), 'dark' and 'light' bound the lightness of the darkest and lightest colors, and 'reverse' flips the palette order.
1571
+
1572
+
1573
+ # In[137]:
1574
+
1575
+
1576
+ # Store in a dictionary
1577
+ round_color_dict = dict(zip(metadata.Round.unique(), round_color_values))
1578
+
1579
+ for k,v in round_color_dict.items():
1580
+ round_color_dict[k] = tuple(np.float64(c) for c in v)  # keep each color as an (R, G, B) tuple of float64
1581
+
1582
+ round_color_dict
1583
+
1584
+
1585
+ # In[138]:
1586
+
1587
+
1588
+ color_df_round = color_dict_to_df(round_color_dict, "Round")
1589
+
1590
+ # Save to file in the metadata directory
1591
+ filename = "round_color_data.csv"
1592
+ filename = os.path.join(metadata_dir, filename)
1593
+ color_df_round.to_csv(filename, index = False)
1594
+
1595
+ color_df_round
1596
+
1597
+ # Legend of round info only
1598
+
1599
+ round_legend = plt.figure(figsize = (1,1)).add_subplot(111)
1600
+ round_legend.axis('off')
1601
+ handles = []
1602
+ for item in round_color_dict.keys():
1603
+ h = round_legend.bar(0,0, color = round_color_dict[item],
1604
+ label = item, linewidth =0)
1605
+ handles.append(h)
1606
+ first_legend = plt.legend(handles=handles, loc='upper right', title = 'Round')
1607
+ # bbox_to_anchor=(10,10),
1608
+ # bbox_transform=plt.gcf().transFigure)
1609
+
1610
+ filename = "Round_legend.png"
1611
+ filename = os.path.join(metadata_images_dir, filename)
1612
+ plt.savefig(filename, bbox_inches = 'tight')
1613
+
1614
+
1615
+ # ### I.7.3. SAMPLES COLORS
1616
+
1617
+ # In[140]:
1618
+
1619
+
1620
+ # we want colors that are neither sequential nor categorical.
1621
+ # Categorical would be ideal if we could generate an arbitrary number of colors, but I do not think that we can.
1622
+ # Hence, we will choose `n` colors from a continuous palette. First we will generate the right number of colors. Later, we will assign TMA samples to gray.
1623
+
1624
+ # Get those unique colors
1625
+ color_values = sb.color_palette("husl",n_colors = len(ls_samples))#'HLS'
1626
+ # each color value is a tuple of three values: (R, G, B)
1627
+
1628
+ # Display those unique colors
1629
+ sb.palplot(sb.color_palette(color_values))
1630
+
1631
+
1632
+ # In[141]:
1633
+
1634
+
1635
+ TMA_samples = [s for s in df.Sample_ID.unique() if 'TMA' in s]
1636
+ TMA_color_values = sb.color_palette(n_colors = len(TMA_samples),palette = "gray")
1637
+ sb.palplot(sb.color_palette(TMA_color_values))
1638
+
1639
+
1640
+ # In[142]:
1641
+
1642
+
1643
+ # Store in a dictionary
1644
+
1645
+ color_dict = dict(zip(df.Sample_ID.unique(), color_values))
1646
+
1647
+ # Replace all TMA samples' colors with gray
1648
+ i = 0
1649
+ for key in color_dict.keys():
1650
+ if 'TMA' in key:
1651
+ color_dict[key] = TMA_color_values[i]
1652
+ i += 1
1653
+
1654
+ color_dict
1655
+
1656
+ color_df_sample = color_dict_to_df(color_dict, "Sample_ID")
1657
+
1658
+ # Save to file in the metadata directory
1659
+ filename = "sample_color_data.csv"
1660
+ filename = os.path.join(metadata_dir, filename)
1661
+ color_df_sample.to_csv(filename, index = False)
1662
+
1663
+ color_df_sample
1664
+
1665
+
1666
+ # Legend of sample info only
1667
+ g = plt.figure(figsize = (1,1)).add_subplot(111)
1668
+ g.axis('off')
1669
+ handles = []
1670
+ for item in color_dict.keys():
1671
+ h = g.bar(0,0, color = color_dict[item],
1672
+ label = item, linewidth =0)
1673
+ handles.append(h)
1674
+ first_legend = plt.legend(handles=handles, loc='upper right', title = 'Sample')
1675
+
1676
+ filename = "Sample_legend.png"
1677
+ filename = os.path.join(metadata_images_dir, filename)
1678
+ plt.savefig(filename, bbox_inches = 'tight')
1679
+
1680
+
1681
+ # ### I.7.4. CLUSTERS COLORS
1682
+
1683
+ '''if 'cluster' in df.columns:
1684
+ cluster_color_values = sb.color_palette("hls",n_colors = len(df.cluster.unique()))
1685
+
1686
+ #print(sorted(test_df.cluster.unique()))
1687
+ # Display those unique colors
1688
+ sb.palplot(sb.color_palette(cluster_color_values))
1689
+
1690
+ cluster_color_dict = dict(zip(sorted(df.cluster.unique()), cluster_color_values))
1691
+ print(cluster_color_dict)
1692
+
1693
+ # Create dataframe
1694
+ cluster_color_df = color_dict_to_df(cluster_color_dict, "cluster")
1695
+ cluster_color_df.head()
1696
+
1697
+ # Save to file in the metadata directory
1698
+ filename = "cluster_color_data.csv"
1699
+ filename = os.path.join(metadata_dir, filename)
1700
+ cluster_color_df.to_csv(filename, index = False)
1701
+
1702
+
1703
+
1704
+ # Legend of cluster info only
1705
+
1706
+ if 'cluster' in df.columns:
1707
+ g = plt.figure(figsize = (1,1)).add_subplot(111)
1708
+ g.axis('off')
1709
+ handles = []
1710
+ for item in sorted(cluster_color_dict.keys()):
1711
+ h = g.bar(0,0, color = cluster_color_dict[item],
1712
+ label = item, linewidth =0)
1713
+ handles.append(h)
1714
+ first_legend = plt.legend(handles=handles, loc='upper right', title = 'Cluster'),
1715
+
1716
+
1717
+ filename = "Clustertype_legend.png"
1718
+ filename = os.path.join(metadata_images_dir, filename)
1719
+ plt.savefig(filename, bbox_inches = 'tight')'''
1720
+
1721
+ mlid.head()
1722
+
1723
+
1724
+ metadata
1725
+
1726
+
1727
+
1728
+ import io
1729
+ import panel as pn
1730
+ pn.extension()
1731
+
1732
+ file_input = pn.widgets.FileInput()
1733
+
1734
+ file_input
1735
+
1736
+
1737
+ def transform_data(variable, window, sigma):
1738
+ """Calculates the rolling average and identifies outliers"""
1739
+ avg = metadata[variable].rolling(window=window).mean()
1740
+ residual = metadata[variable] - avg
1741
+ std = residual.rolling(window=window).std()
1742
+ outliers = np.abs(residual) > std * sigma
1743
+ return avg, avg[outliers]
1744
+
1745
+
1746
+ def get_plot(variable="Exp", window=30, sigma=10):
1747
+ """Plots the rolling average and the outliers"""
1748
+ avg, highlight = transform_data(variable, window, sigma)
1749
+ return avg.hvplot(
1750
+ height=300, legend=False,
1751
+ ) * highlight.hvplot.scatter(padding=0.1, legend=False)
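+
+ # Note: the .hvplot accessor used above assumes `import hvplot.pandas` has been
+ # run earlier in this script; without it, pandas objects have no .hvplot attribute.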
1752
+
1753
+
1754
+ variable_widget = pn.widgets.Select(name="Target", value="Exp", options=list(metadata.columns))
1755
+ window_widget = pn.widgets.IntSlider(name="window", value=30, start=1, end=60)
1756
+ sigma_widget = pn.widgets.IntSlider(name="sigma", value=10, start=0, end=20)
1757
+
1758
+ app = pn.template.GoldenTemplate(
1759
+ site="Cyc-IF",
1760
+ title="Quality Control",
1761
+ main=[
1762
+ pn.Tabs(
1763
+ ("Dataframes", pn.Column(
1764
+ pn.Row(csv_files_button,pn.bind(handle_click, csv_files_button.param.clicks)),
1765
+ pn.pane.Markdown("### The Dataframe uploaded:"), pn.pane.DataFrame(intial_dataframe),
1766
+ #pn.pane.Markdown("### The Exposure time DataFrame is :"), pn.pane.DataFrame(exp_df.head()),
1767
+ pn.pane.Markdown("### The DataFrame after merging CycIF data x metadata :"), pn.pane.DataFrame(merged_dataframe.head()),
1768
+ )),
1769
+ ("Quality Control", pn.Column(
1770
+ quality_check(quality_control_df, not_intensities)
1771
+ #pn.pane.Markdown("### The Quality check results are:"), quality_check_results(check_shape, check_no_null, check_all_expected_files_present, check_zero_intensities)
1772
+ )),
1773
+ ("Intensities", pn.Column(
1774
+ pn.pane.Markdown("### The Not Intensities DataFrame after processing is :"), pn.pane.DataFrame(not_intensities_df, height=250),
1775
+ pn.pane.Markdown("### Select Intensities to be included"), updated_intensities,
1776
+ #pn.pane.Markdown("### The Intensities DataFrame"), intensities_df,
1777
+ #pn.pane.Markdown("### The metadata obtained that specifies the localisation:"), pn.pane.DataFrame(mlid.head())
1778
+ )),
1779
+ ("Plots", pn.Column(
1780
+ #pn.pane.Markdown(" ### Nucleus Size Distribution: "), pn.Row(nucleus_size_line_graph_with_histogram, num_of_cell_removal),
1781
+ #pn.pane.Markdown(" ### Nucleus Size Distribution: "), pn.Row(plot1,layout2),
1782
+ #pn.pane.Markdown("### Nucleus Distribution Plot:"), pn.Column(nucleus_size_plot, nucleus_size_graph),
1783
+ pn.pane.Markdown(" ### Intensity Average Plot:"), pn.Row(selected_marker_plot,num_of_cell_removal_intensity ),
1784
+ #pn.Column(pn.Column(column_dropdown, generate_plot_button), quantile_slider, plot),
1785
+ #pn.pane.Markdown("### Cytoplasm Intensity Plot:"), cytoplasm_intensity_plot,
1786
+ #pn.pane.Markdown("### AF555_Cell_Intensity_Average:"), quantile_output_app,
1787
+ #pn.pane.Markdown("### Distribution of AF555_Cell_Intensity_Average with Quantiles:"), quantile_intensity_plot)
1788
+ )),
1789
+
1790
+ ),
1791
+ ])
1792
+
1793
+ app.servable()
1794
+
1795
+ if __name__ == "__main__":
1796
+ pn.serve(app, port=5007)
my_modules.py ADDED
@@ -0,0 +1,468 @@
1
+ import os
2
+ import numpy as np
3
+ import pandas as pd
4
+ import subprocess
5
+
6
+ import random
7
+ import re
8
+
9
+
10
+ import seaborn as sb
11
+ import matplotlib.pyplot as plt
12
+ import matplotlib.colors as mplc
13
+
14
+
15
+
16
+ from scipy import signal
17
+
18
+ import plotly.figure_factory as ff
19
+ import plotly
20
+ import plotly.graph_objs as go
21
+ from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
22
+
23
+
24
+ # This function takes in a dataframe, changes the names
25
+ # of the column in various ways, and returns the dataframe.
26
+ # For best accuracy and generalizability, the code uses
27
+ # regular expressions (regex) to find strings for replacement.
28
+ def apply_header_changes(df):
29
+ # remove lowercase x at beginning of name
30
+ df.columns = df.columns.str.replace("^x","")
31
+ # remove space at beginning of name
32
+ df.columns = df.columns.str.replace("^ ","")
33
+ # replace space with underscore
34
+ df.columns = df.columns.str.replace(" ","_")
35
+ # fix typos
36
+ df.columns = df.columns.str.replace("AF_AF","AF")
37
+ # change "Cell Id" into "ID"
38
+ df.columns = df.columns.str.replace("Cell Id","ID")
39
+ # name the index "ID" (the cell ID is stored in the index)
40
+ df.index.name = "ID"
41
+
42
+
43
+ return df
44
+
45
+ def apply_df_changes(df):
46
+ # Remove "@1" after the ID in the index
47
+ df.index = df.index.str.replace(r'@1$', '', regex = True)
48
+ return df
49
+
50
+ def compare_headers(expected, actual, name):
51
+ missing_actual = np.setdiff1d(expected, actual)
52
+ extra_actual = np.setdiff1d(actual, expected)
53
+ if len(missing_actual) > 0:
54
+ #print("WARNING: File '" + name + "' lacks the following expected header(s) after import header reformatting: \n"
55
+ # + str(missing_actual))
56
+ print("WARNING: File '" + name + "' lacks the following expected item(s): \n" + str(missing_actual))
57
+ if len(extra_actual) > 0:
58
+ #print("WARNING: '" + name + "' has the following unexpected header(s) after import header reformatting: \n"
59
+ # + str(extra_actual))
60
+ print("WARNING: '" + name + "' has the following unexpected item(s): \n" + str(extra_actual))
61
+
62
+ return None
63
+
64
+
65
+ def add_metadata_location(row):
66
+ fc = row['full_column'].lower()
67
+ if 'cytoplasm' in fc and 'cell' not in fc and 'nucleus' not in fc:
68
+ return 'cytoplasm'
69
+ elif 'cell' in fc and 'cytoplasm' not in fc and 'nucleus' not in fc:
70
+ return 'cell'
71
+ elif 'nucleus' in fc and 'cell' not in fc and 'cytoplasm' not in fc:
72
+ return 'nucleus'
73
+ else:
74
+ return 'unknown'
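+
+ # Illustrative example (hypothetical column name):
+ # add_metadata_location({'full_column': 'CKs_Cytoplasm_Intensity_Average'})
+ # # -> 'cytoplasm'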
75
+
76
+
77
+ def get_perc(row, cell_type):
78
+ total = row['stroma'] + row['immune'] + row['cancer']+row['endothelial']
79
+ return round(row[cell_type]/total *100,1)
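+
+ # Illustrative example (hypothetical counts): for a row with stroma=50,
+ # immune=25, cancer=20, endothelial=5, get_perc(row, 'immune') -> 25.0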
80
+
81
+
82
+
83
+ # Divide each marker (and its localisation) by the right exposure setting for each group of samples
84
+ def divide_exp_time(col, exp_col, metadata):
85
+ exp_time = metadata.loc[metadata['full_column'] == col.name, exp_col].values[0]
86
+ return col/exp_time
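+
+ # Hypothetical usage sketch (column names assumed): normalize every intensity
+ # column of df by the exposure time stored in metadata:
+ # df.loc[:, intensity_cols] = df.loc[:, intensity_cols].apply(
+ #     divide_exp_time, exp_col = 'Exp_ms', metadata = metadata)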
87
+
88
+
89
+ def do_background_sub(col, df, metadata):
90
+ #print(col.name)
91
+ location = metadata.loc[metadata['full_column'] == col.name, 'localisation'].values[0]
92
+ #print('location = ' + location)
93
+ channel = metadata.loc[metadata['full_column'] == col.name, 'Channel'].values[0]
94
+ #print('channel = ' + channel)
95
+ af_target = metadata.loc[
96
+ (metadata['Channel']==channel) \
97
+ & (metadata['localisation']==location) \
98
+ & (metadata['target_lower'].str.contains(r'^af\d{3}$')),\
99
+ 'full_column'].values[0]
100
+ return col - df.loc[:,af_target]
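+
+ # Hypothetical usage sketch: subtract each marker's matching autofluorescence
+ # (AF) channel, looked up per channel and localisation as above:
+ # df.loc[:, intensity_cols] = df.loc[:, intensity_cols].apply(
+ #     do_background_sub, df = df, metadata = metadata)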
101
+
102
+
103
+ """
104
+ This function plots marker intensity distributions per sample. It takes in a
105
+ string plot title (title), an output directory for the saved png (location),
106
+ a list of dataframes from which to plot (dfs), a list of dataframe names for
107
+ the legend (df_names), a list of the desired colors for the plotted samples
108
+ (colors), a string for the x-axis label (x_label), and a boolean to show the
109
+ legend or not (legend).
110
+ Optional: x-axis limits (xlims, a two-item [min, max] list), the name(s) of the
111
+ marker(s) to plot (markers), and the list of column names that are not
112
+ intensity markers (not_intensities). markers can either be a single string,
113
+ e.g., 'my_marker', or a list, e.g., ['my_marker1','my_marker2']; by default
114
+ ('all'), every intensity column in the first dataframe is plotted.
115
+
116
+ The function creates a plotly distribution plot and saves it to png in `location`.
117
+ """
118
+
119
+
120
+
121
+ def make_distr_plot_per_sample(title, location, dfs, df_names, colors, x_label, legend, xlims = None, markers = ['all'],not_intensities = None):
122
+ ### GET LIST OF MARKERS TO PLOT ###
123
+ # Get list of markers to plot if not specified by user, using columns in first df
124
+ # Writing function(parameter = FILLER) makes that parameter optional when user calls function,
125
+ # since it is given a default value!
126
+ if markers == ["all"]:
127
+ markers = [c for c in dfs[0].columns.values if c not in not_intensities]
128
+ elif not isinstance(markers, list):
129
+ markers = [markers]
130
+ # Make input labels a set to get only unique values, then put back into list
131
+ markers = list(set(markers))
132
+
133
+ ### GET XLIMS ###
134
+ if xlims is None:
135
+ mins = [df.loc[:,markers].min().min() for df in dfs]
136
+ maxes = [df.loc[:,markers].max().max() for df in dfs]
137
+ xlims = [min(mins), max(maxes)]
138
+ if not isinstance(xlims, list):
139
+ print("Problem - xlmis not list. Exiting method...")
140
+ return None
141
+ ### CHECK DATA CAN BE PLOTTED ###
142
+ # Check for data with only 1 unique value - this will cause error if plotted
143
+ group_labels = []
144
+ hist_data = []
145
+ # Iterate through all dataframes (dfs)
146
+ for i in range(len(dfs)):
147
+ # Iterate through all marker labels
148
+ for f in markers:
149
+ # If there is only one unique value in the marker data for this dataframe,
150
+ # you cannot plot a distribution plot. It gives you a linear algebra
151
+ # singular value matrix error
152
+ if dfs[i][f].nunique() != 1:
153
+ # Add df name and marker name to labels list
154
+ # If we have >1 df, we want to make clear
155
+ # which legend label is associated with which df
156
+ if len(df_names) > 1:
157
+ group_labels.append(df_names[i]+"_"+f)
158
+ else:
159
+ group_labels.append(f)
160
+ # add the data to the data list
161
+ hist_data.append(dfs[i][f])
162
+ # if no data had >1 unique values, there is nothing to plot
163
+ if len(group_labels) < 1:
164
+ print("No markers plotted - all were singular value. Names and markers were " + str(df_names) + ", " + str(markers))
165
+ return None
166
+
167
+ ### TRANSFORM COLOR ITEMS TO CORRECT TYPE ###
168
+ if isinstance(colors[0], tuple):
169
+ colors = ['rgb' + str(color) for color in colors]
170
+
171
+ ### PLOT DATA ###
172
+ # Create plot
173
+ fig = ff.create_distplot(hist_data, group_labels, bin_size=0.1,
174
+ #colors=colors, bin_size=bin_size, show_rug=False)#show_hist=False,
175
+ colors=colors, show_rug=False)
176
+ # Adjust title, font, background color, legend...
177
+ fig.update_layout(title_text=title, font=dict(size=18),
178
+ plot_bgcolor = 'white', showlegend = legend)#, legend_x = 3)
179
+ # Adjust opacity
180
+ fig.update_traces(opacity=0.6)
181
+ # Adjust x-axis parameters
182
+ fig.update_xaxes(title_text = x_label, showline=True, linewidth=2, linecolor='black',
183
+ tickfont=dict(size=18), range = xlims) # x lims was here
184
+ # Adjust y-axis parameters
185
+ fig.update_yaxes(title_text = "Kernel density estimate",showline=True, linewidth=1, linecolor='black',
186
+ tickfont=dict(size=18))
187
+
188
+
189
+ ### SAVE/DISPLAY PLOT ###
190
+ # Save plot to HTML
191
+ # plotly.io.write_html(fig, file = output_dir + "/" + title + ".html")
192
+ # Plot in new tab
193
+ #plot(fig)
194
+ # Save to png
195
+ filename = os.path.join(location, title.replace(" ","_") + ".png")
196
+ fig.write_image(filename)
197
+ return None
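+
+ # Hypothetical usage sketch (dataframe, color, and directory names assumed):
+ # make_distr_plot_per_sample("CD45 distribution", output_images_dir, [df],
+ #     ["all samples"], [(0.2, 0.4, 0.8)], "Intensity", legend = True,
+ #     markers = ['CD45_Cell_Intensity_Average'], not_intensities = not_intensities)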
198
+
199
+
200
+
201
+
202
+
203
+ # this could be changed to use recursion and make it 'smarter'
204
+
205
+ def shorten_feature_names(long_names):
206
+ name_dict = dict(zip(long_names,[n.split('_')[0] for n in long_names]))
207
+ names_lts, long_names, iteration = shorten_feature_names_helper(name_dict, long_names, 1)
208
+ # names_lts = names long-to-short
209
+ # names_stl = names stl
210
+ names_stl = {}
211
+ for n in names_lts.items():
212
+ names_stl[n[1]] = n[0]
213
+ return names_lts, names_stl
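+
+ # Illustrative example: first-token nicknames for ['CD45_Cell', 'CD45_Nucleus']
+ # collide on 'CD45', so the helper extends them until unique:
+ # shorten_feature_names(['CD45_Cell', 'CD45_Nucleus'])[0]
+ # # -> {'CD45_Cell': 'CD45_Cell', 'CD45_Nucleus': 'CD45_Nucleus'}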
214
+
215
+
216
+ def shorten_feature_names_helper(name_dict, long_names, iteration):
217
+ #print("\nThis is iteration #"+str(iteration))
218
+ #print("name_dict is: " + str(name_dict))
219
+ #print("long_names is: " + str(long_names))
220
+ ## If the number of unique nicknames == number of long names
221
+ ## then the work here is done
222
+ #print('\nCompare lengths: ' + str(len(set(name_dict.values()))) + ", " + str(len(long_names)))
223
+ #print('set(name_dict.values()): ' + str(set(name_dict.values())))
224
+ #print('long_names: ' + str(long_names))
225
+ if len(set(name_dict.values())) == len(long_names):
226
+ #print('All done!')
227
+ return name_dict, long_names, iteration
228
+
229
+ ## otherwise, if the number of unique nicknames is not
230
+ ## equal to the number of long names (must be shorter than),
231
+ ## then we need to find more unique names
232
+ iteration += 1
233
+ nicknames_set = set()
234
+ non_unique_nicknames = set()
235
+ # construct set of current nicknames
236
+ for long_name in long_names:
237
+ #print('long_name is ' + long_name + ' and non_unique_nicknames set is ' + str(non_unique_nicknames))
238
+ short_name = name_dict[long_name]
239
+ if short_name in nicknames_set:
240
+ non_unique_nicknames.add(short_name)
241
+ else:
242
+ nicknames_set.add(short_name)
243
+ #print('non_unique_nicknames are: ' + str(non_unique_nicknames))
244
+
245
+ # figure out all long names associated
246
+ # with the non-unique short names
247
+ trouble_long_names = set()
248
+ for long_name in long_names:
249
+ short_name = name_dict[long_name]
250
+ if short_name in non_unique_nicknames:
251
+ trouble_long_names.add(long_name)
252
+
253
+ #print('troublesome long names are: ' + str(trouble_long_names))
254
+ #print('name_dict: ' + str(name_dict))
255
+ # operate on all names that are associated with
256
+ # the non-unique short nicknames
257
+ for long_name in trouble_long_names:
258
+ #print('trouble long name is: ' + long_name)
259
+ #print('old nickname is: ' + name_dict[long_name])
260
+ name_dict[long_name] = '_'.join(long_name.split('_')[0:iteration])
261
+ #print('new nickname is: ' + name_dict[long_name])
262
+ return shorten_feature_names_helper(name_dict, long_names, iteration)
263
+
264
+
265
+
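+ # NOTE: heatmap_function2 below appears unfinished -- it only prepares the
+ # row/column color mappings (at a larger font scale); heatmap_function further
+ # down is the complete implementation.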
266
+ def heatmap_function2(title,
267
+ data,
268
+ method, metric, cmap,
269
+ cbar_kws, xticklabels, save_loc,
270
+ row_cluster, col_cluster,
271
+ annotations = {'rows':[],'cols':[]}):
272
+
273
+ sb.set(font_scale= 6.0)
274
+
275
+ # Extract row and column mappings
276
+ row_mappings = []
277
+ col_mappings = []
278
+ for ann in annotations['rows']:
279
+ row_mappings.append(ann['mapping'])
280
+ for ann in annotations['cols']:
281
+ col_mappings.append(ann['mapping'])
282
+ # If empty lists, convert to None so seaborn accepts
283
+ # as the row_colors or col_colors objects
284
+ if len(row_mappings) == 0:
285
+ row_mappings = None
286
+ if len(col_mappings) == 0:
287
+ col_mappings = None
288
+
289
+ def heatmap_function(title,
290
+ data,
291
+ method, metric, cmap,
292
+ cbar_kws, xticklabels, save_loc,
293
+ row_cluster, col_cluster,
294
+ annotations = {'rows':[],'cols':[]}):
295
+
296
+ sb.set(font_scale= 2.0)
297
+
298
+ # Extract row and column mappings
299
+ row_mappings = []
300
+ col_mappings = []
301
+ for ann in annotations['rows']:
302
+ row_mappings.append(ann['mapping'])
303
+ for ann in annotations['cols']:
304
+ col_mappings.append(ann['mapping'])
305
+ # If empty lists, convert to None so seaborn accepts
306
+ # as the row_colors or col_colors objects
307
+ if len(row_mappings) == 0:
308
+ row_mappings = None
309
+ if len(col_mappings) == 0:
310
+ col_mappings = None
311
+
312
+ # Create clustermap
313
+ g = sb.clustermap(data = data,
314
+ robust = True,
315
+ method = method, metric = metric,
316
+ cmap = cmap,
317
+ row_cluster = row_cluster, col_cluster = col_cluster,
318
+ figsize = (40,30),
319
+ row_colors=row_mappings, col_colors=col_mappings,
320
+ yticklabels = False,
321
+ cbar_kws = cbar_kws,
322
+ xticklabels = xticklabels)
323
+
324
+ # To rotate slightly the x labels
325
+ plt.setp(g.ax_heatmap.xaxis.get_majorticklabels(), rotation=45)
326
+
327
+ # Add title
328
+ g.fig.suptitle(title, fontsize = 60.0)
329
+
330
+ #And now for the legends:
331
+ # iterate through 'rows', 'cols'
332
+ for ann_type in annotations.keys():
333
+ # iterate through each individual annotation feature
334
+ for ann in annotations[ann_type]:
335
+ color_dict = ann['dict']
336
+ handles = []
337
+ for item in color_dict.keys():
338
+ h = g.ax_col_dendrogram.bar(0,0, color = color_dict[item], label = item,
339
+ linewidth = 0)
340
+ handles.append(h)
341
+ legend = plt.legend(handles = handles, loc = ann['location'], title = ann['label'],
342
+ bbox_to_anchor=ann['bbox_to_anchor'],
343
+ bbox_transform=plt.gcf().transFigure)
344
+ ax = plt.gca().add_artist(legend)
345
+
346
+ # Save image
347
+ filename = os.path.join(save_loc, title.lower().replace(" ","_") + ".png")
348
+ g.savefig(filename)
349
+
350
+ return None
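+
+ # Hypothetical usage sketch (data and directory names assumed; annotations
+ # left at their empty default):
+ # heatmap_function("Marker intensities", data = df[intensity_cols],
+ #     method = 'average', metric = 'euclidean', cmap = 'coolwarm',
+ #     cbar_kws = {'label': 'intensity'}, xticklabels = True,
+ #     save_loc = output_images_dir, row_cluster = True, col_cluster = True)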
351
+
352
+
353
+
354
+ # sources -
355
+ #https://stackoverflow.com/questions/27988846/how-to-express-classes-on-the-axis-of-a-heatmap-in-seaborn
356
+ # https://matplotlib.org/3.1.1/tutorials/intermediate/legend_guide.html
357
+
358
+
359
+ def verify_line_no(filename, lines_read):
360
+ # Use Linux "wc -l" command to get the number of lines in the unopened file
361
+ wc = subprocess.check_output(['wc', '-l', filename]).decode("utf-8")
362
+ # Take that string, turn it into a list, extract the first item,
363
+ # and make that an int - this is the number of lines in the file
364
+ wc = int(wc.split()[0])
365
+ if lines_read != wc:
366
+ print("WARNING: '" + filename + "' has " + str(wc) +
367
+ " lines, but imported dataframe has "
368
+ + str(lines_read) + " (including header).")
369
+ return None
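+
+ # Portable sketch (an assumption, not part of the original pipeline): a
+ # pure-Python line count for systems without the Unix `wc` binary:
+ # def count_file_lines(filename):
+ #     with open(filename, 'rb') as f:
+ #         return sum(1 for _ in f)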
370
+
371
+
372
+ def rgb_tuple_from_str(rgb_str):
373
+ rgb_str = rgb_str.replace("(","").replace(")","").replace(" ","")
374
+ rgb = list(map(float,rgb_str.split(",")))
375
+ return tuple(rgb)
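+
+ # e.g. rgb_tuple_from_str("(0.1, 0.2, 0.3)") -> (0.1, 0.2, 0.3)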
376
+
377
+ def color_dict_to_df(cd, column_name):
378
+ df = pd.DataFrame.from_dict(cd, orient = 'index')
379
+ df['rgb'] = df.apply(lambda row: (np.float64(row[0]), np.float64(row[1]), np.float64(row[2])), axis = 1)
380
+ df = df.drop(columns = [0,1,2])
381
+ df['hex'] = df.apply(lambda row: mplc.to_hex(row['rgb']), axis = 1)
382
+ df[column_name] = df.index
383
+ return df
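+
+ # Illustrative example (hypothetical entry): color_dict_to_df({'c2': (0.0, 1.0, 0.0)}, "Channel")
+ # yields one row with 'rgb' = (0.0, 1.0, 0.0), 'hex' = '#00ff00', 'Channel' = 'c2'.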
384
+
385
+
386
+ # Format p-values in scientific notation, adding a star to those <= 0.05
387
+ def p_add_star(row):
388
+ m = [str('{:0.3e}'.format(m)) + "*"
389
+ if m <= 0.05 \
390
+ else str('{:0.3e}'.format(m))
391
+ for m in row ]
392
+ return pd.Series(m)
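+
+ # e.g. p_add_star(pd.Series([0.04, 0.2])) -> pd.Series(['4.000e-02*', '2.000e-01'])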
393
+
394
+ # assigns a specific number of asterisks based on the thresholds
395
+ def p_to_star(row):
396
+ output = []
397
+ for item in row:
398
+ if item <= 0.001:
399
+ stars = 3
400
+ elif item <= 0.01:
401
+ stars = 2
402
+ elif item <= 0.05:
403
+ stars = 1
404
+ else:
405
+ stars = 0
406
+ value = ''
407
+ for i in range(stars):
408
+ value += '*'
409
+ output.append(value)
410
+ return pd.Series(output)
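+
+ # e.g. p_to_star(pd.Series([0.0005, 0.03, 0.2])) -> pd.Series(['***', '*', ''])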
411
+
412
+
413
+
414
+ def plot_gaussian_distributions(df):
415
+ # Initialize thresholds list to store all calculated thresholds
416
+ all_thresholds = []
417
+
418
+ # Iterate over every column (the caller should pass only numeric marker columns)
419
+ for column in df.columns:
420
+ # Extract the marker data
421
+ marker_data = df[column]
422
+
423
+ # Calculating mean and standard deviation for each marker
424
+ m_mean, m_std = np.mean(marker_data), np.std(marker_data)
425
+
426
+ # Generating x values for the Gaussian curve
427
+ x_vals = np.linspace(marker_data.min(), marker_data.max(), 100)
428
+
429
+ # Calculating Gaussian distribution curve
430
+ gaussian_curve = (1 / (m_std * np.sqrt(2 * np.pi))) * np.exp(-(x_vals - m_mean) ** 2 / (2 * m_std ** 2))
431
+
432
+ # Creating figure for Gaussian distribution for each marker
433
+ fig = go.Figure()
434
+ fig.add_trace(go.Scatter(x=x_vals, y=gaussian_curve, mode='lines', name=f'{column} Gaussian Distribution'))
435
+ fig.update_layout(title=f'Gaussian Distribution for {column} Marker')
436
+
437
+ # Calculating thresholds based on each marker's distribution
438
+ seuil_1sigma = m_mean + m_std
439
+ seuil_2sigma = m_mean + 2 * m_std
440
+ seuil_3sigma = m_mean + 3 * m_std
441
+
442
+ # Display the figures with thresholds
443
+ fig.add_shape(type='line', x0=seuil_1sigma, y0=0, x1=seuil_1sigma, y1=np.max(gaussian_curve),
444
+ line=dict(color='red', dash='dash'), name=f'Threshold 1σ: {seuil_1sigma:.2f}')
445
+ fig.add_shape(type='line', x0=seuil_2sigma, y0=0, x1=seuil_2sigma, y1=np.max(gaussian_curve),
446
+ line=dict(color='green', dash='dash'), name=f'Threshold 2σ: {seuil_2sigma:.2f}')
447
+ fig.add_shape(type='line', x0=seuil_3sigma, y0=0, x1=seuil_3sigma, y1=np.max(gaussian_curve),
448
+ line=dict(color='blue', dash='dash'), name=f'Threshold 3σ: {seuil_3sigma:.2f}')
449
+
450
+ # Add markers and values to the plot
451
+ fig.add_trace(go.Scatter(x=[seuil_1sigma, seuil_2sigma, seuil_3sigma],
452
+ y=[0, 0, 0],
453
+ mode='markers+text',
454
+ text=[f'{seuil_1sigma:.2f}', f'{seuil_2sigma:.2f}', f'{seuil_3sigma:.2f}'],
455
+ textposition="top center",
456
+ marker=dict(size=10, color=['red', 'green', 'blue']),
457
+ name='Threshold Values'))
458
+
459
+ fig.show()
460
+
461
+ # Append thresholds for each marker to the list
462
+ all_thresholds.append((column, seuil_1sigma, seuil_2sigma, seuil_3sigma)) # Include the column name
463
+
464
+ # Return thresholds for all markers
465
+ return all_thresholds
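+
+ # Hypothetical usage sketch (column subset assumed): pass only numeric marker
+ # columns, e.g. thresholds = plot_gaussian_distributions(df[intensity_cols]),
+ # which returns one (column, mean+1σ, mean+2σ, mean+3σ) tuple per marker.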
466
+
467
+
468
+
stored_variables.json ADDED
@@ -0,0 +1,6 @@
1
+ {
2
+ "base_dir": "/Users/harshithakolipaka/Downloads/wetransfer_data-zip_2024-05-17_1431",
3
+ "set_path": "test",
4
+ "ls_samples": ["DD3S1.csv", "DD3S2.csv", "DD3S3.csv", "TMA.csv"],
5
+ "selected_metadata_files": ["Slide_B_DD1s1.one_1.tif.csv"]
6
+ }