#!/usr/bin/env python
# coding: utf-8

import os
import random
import re
import subprocess
import warnings

import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt
import matplotlib.colors as mplc
from scipy import signal
from scipy.stats import pearsonr  # scipy.stats.stats is deprecated

import plotly
import plotly.figure_factory as ff
import plotly.graph_objs as go
import plotly.express as px
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import panel as pn

from my_modules import *

# Silence FutureWarnings & UserWarnings
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=UserWarning)
# ## III.2. DIRECTORIES

# In[4]:

# Set base directory
##### MAC WORKSTATION #####
#base_dir = r'/Volumes/LaboLabrie/Projets/OC_TMA_Pejovic/Temp/Zoe/CyCIF_pipeline/'
###########################

##### WINDOWS WORKSTATION #####
#base_dir = r'C:\Users\LaboLabrie\gerz2701\cyCIF-pipeline\Set_B'
###############################

##### LOCAL WORKSTATION #####
#base_dir = r'/Users/harshithakolipaka/Downloads/wetransfer_data-zip_2024-05-17_1431'
#############################

present_dir = os.path.dirname(os.path.realpath(__file__))
input_path = os.path.join(present_dir, 'wetransfer_data-zip_2024-05-17_1431')
base_dir = input_path

#set_name = 'Set_A'
set_name = 'test'
# In[5]:

set_path = set_name
selected_metadata_files = ['Slide_B_DD1s1.one_1.tif.csv', 'Slide_B_DD1s1.one_2.tif.csv']
ls_samples = ['Ashlar_Exposure_Time.csv', 'new_data.csv', 'DD3S1.csv', 'DD3S2.csv', 'DD3S3.csv', 'TMA.csv']

print(base_dir)
print(set_path)
print(ls_samples)
print(selected_metadata_files)
project_name = set_name            # Project name
step_suffix = 'zscore'             # Current step (here, part III)
previous_step_suffix_long = "_bs"  # Previous step (here, the BS notebook)

# Initial input data directory
input_data_dir = os.path.join(base_dir, project_name + previous_step_suffix_long)

# ZSCORE/LOG2 output directory
output_data_dir = os.path.join(base_dir, project_name + "_" + step_suffix)

# ZSCORE/LOG2 images subdirectory
output_images_dir = os.path.join(output_data_dir, "images")

# Metadata directory
metadata_dir = os.path.join(base_dir, project_name + "_metadata")

# Metadata images subdirectory
metadata_images_dir = os.path.join(metadata_dir, "images")

# Create directories if they don't already exist
for d in [base_dir, input_data_dir, output_data_dir, output_images_dir, metadata_dir, metadata_images_dir]:
    if not os.path.exists(d):
        print("Creation of the", d, "directory...")
        os.makedirs(d)
    else:
        print("The", d, "directory already exists!")

os.chdir(input_data_dir)
# In[7]:

# Verify paths
print('base_dir            :', base_dir)
print('input_data_dir      :', input_data_dir)
print('output_data_dir     :', output_data_dir)
print('output_images_dir   :', output_images_dir)
print('metadata_dir        :', metadata_dir)
print('metadata_images_dir :', metadata_images_dir)
# ## III.3. FILES

# Don't forget to put your data in the projname_data directory!

# ### III.3.1. METADATA

# In[8]:

# Import all metadata we need from the BS chapter

# METADATA
filename = "marker_intensity_metadata.csv"
filename = os.path.join(metadata_dir, filename)

# Check the file exists
if not os.path.exists(filename):
    print("WARNING: Could not find desired file: " + filename)
else:
    print("The", filename, "file was imported for further analysis!")

# Open and read in information
metadata = pd.read_csv(filename)

# Verify size with the verify_line_no() function in my_modules.py
#verify_line_no(filename, metadata.shape[0] + 1)

# Verify headers
exp_cols = ['Round', 'Target', 'Channel', 'target_lower', 'full_column', 'marker', 'localisation']
compare_headers(exp_cols, metadata.columns.values, "Marker metadata file")

metadata = metadata.dropna()
metadata.head()
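# compare_headers() comes from my_modules and is not shown here. A minimal
# sketch of what it is assumed to do (hypothetical name, so the real import
# is not shadowed):
def _compare_headers_sketch(expected, actual, title):
    """Warn about any difference between expected and actual column names."""
    missing = set(expected) - set(actual)
    extra = set(actual) - set(expected)
    if missing or extra:
        print(f"WARNING: {title} headers differ. Missing: {sorted(missing)}, unexpected: {sorted(extra)}")
    else:
        print(f"{title}: all expected headers are present.")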
# ### III.3.2. NOT_INTENSITIES

# In[9]:

filename = "not_intensities.csv"
filename = os.path.join(metadata_dir, filename)

# Check the file exists
if not os.path.exists(filename):
    print("WARNING: Could not find desired file: " + filename)
else:
    print("The", filename, "file was imported for further analysis!")

# Open and read in information:
# take the str, strip whitespace, split on the newline character
not_intensities = []
with open(filename, 'r') as fh:
    not_intensities = fh.read().strip().split("\n")

# Verify size
print("Verifying data read from file is the correct length...\n")
#verify_line_no(filename, len(not_intensities))

# Print to console
print("not_intensities =\n", not_intensities)
pd.DataFrame(not_intensities)
# ### III.3.3. FULL_TO_SHORT_COLUMN_NAMES

# In[10]:

filename = "full_to_short_column_names.csv"
filename = os.path.join(metadata_dir, filename)

# Check the file exists
if not os.path.exists(filename):
    print("WARNING: Could not find desired file: " + filename)
else:
    print("The", filename, "file was imported for further analysis!")

# Open and read in information
df = pd.read_csv(filename, header=0)

# Verify size
print("Verifying data read from file is the correct length...\n")
#verify_line_no(filename, df.shape[0] + 1)

# Turn into a dictionary mapping full_name -> short_name
full_to_short_names = df.set_index('full_name').T.to_dict('records')[0]

# CD45 instead of CD45b (Slide A only)
if project_name == 'Slide_A':
    full_to_short_names['CD45_Cytoplasm_Intensity_Average'] = full_to_short_names.pop('CD45b_Cytoplasm_Intensity_Average')
    full_to_short_names['CD45_Cytoplasm_Intensity_Average'] = 'CD45_Cytoplasm'

# Print information
print('full_to_short_names =\n', full_to_short_names)
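# Tiny illustration (hypothetical one-row table) of the
# set_index().T.to_dict('records')[0] idiom used above: it turns a
# two-column mapping table into a plain dict.
_map = pd.DataFrame({'full_name': ['AR_Nucleus_Intensity_Average'],
                     'short_name': ['AR_Nucleus']})
print(_map.set_index('full_name').T.to_dict('records')[0])
# -> {'AR_Nucleus_Intensity_Average': 'AR_Nucleus'}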
# ### III.3.4. SHORT_TO_FULL_COLUMN_NAMES

# In[11]:

filename = "short_to_full_column_names.csv"
filename = os.path.join(metadata_dir, filename)

# Check the file exists
if not os.path.exists(filename):
    print("WARNING: Could not find desired file: " + filename)
else:
    print("The", filename, "file was imported for further analysis!")

# Open and read in information
df = pd.read_csv(filename, header=0)

# Verify size
print("Verifying data read from file is the correct length...\n")
#verify_line_no(filename, df.shape[0] + 1)

# Turn into a dictionary mapping short_name -> full_name
short_to_full_names = df.set_index('short_name').T.to_dict('records')[0]

# CD45 instead of CD45b (Slide A only)
if project_name == 'Slide_A':
    short_to_full_names['CD45_Cytoplasm'] = short_to_full_names.pop('CD45b_Cytoplasm')
    short_to_full_names['CD45_Cytoplasm'] = 'CD45_Cytoplasm_Intensity_Average'

# Print information
print('short_to_full_names =\n', short_to_full_names)
# ### III.3.5. SAMPLES COLORS

# In[12]:

filename = "sample_color_data.csv"
filename = os.path.join(metadata_dir, filename)

# Check the file exists
if not os.path.exists(filename):
    print("WARNING: Could not find desired file: " + filename)
else:
    print("The", filename, "file was imported for further analysis!")

# Open and read in information
df = pd.read_csv(filename, header=0)
df = df.drop(columns=['hex'])

# Our tuple of float values for rgb, (r, g, b), was read in as a string
# '(r, g, b)': extract the r-, g- and b- substrings and convert them back into floats
df['rgb'] = df.apply(lambda row: rgb_tuple_from_str(row['rgb']), axis=1)

# Verify size
print("Verifying data read from file is the correct length...\n")
#verify_line_no(filename, df.shape[0] + 1)

# Build a Sample_ID -> rgb lookup
sample_color_dict = df.set_index('Sample_ID')['rgb']

# Print information
print('sample_color_dict =\n', sample_color_dict)
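# rgb_tuple_from_str() comes from my_modules and is not shown here. A minimal
# sketch of what it is assumed to do (hypothetical name, so the real import
# is not shadowed):
def _rgb_tuple_from_str_sketch(s):
    """Parse a string like '(0.1, 0.2, 0.3)' into a tuple of floats."""
    return tuple(float(x) for x in s.strip('()').split(','))

assert _rgb_tuple_from_str_sketch('(0.1, 0.2, 0.3)') == (0.1, 0.2, 0.3)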
# ### III.3.6. CHANNELS COLORS

# In[13]:

filename = "channel_color_data.csv"
filename = os.path.join(metadata_dir, filename)

# Check the file exists
if not os.path.exists(filename):
    print("WARNING: Could not find desired file: " + filename)
else:
    print("The", filename, "file was imported for further analysis!")

# Open and read in information
df = pd.read_csv(filename, header=0)
df = df.drop(columns=['hex'])

# As above, parse the '(r, g, b)' strings back into tuples of floats
df['rgb'] = df.apply(lambda row: rgb_tuple_from_str(row['rgb']), axis=1)

# Verify size
print("Verifying data read from file is the correct length...\n")
#verify_line_no(filename, df.shape[0] + 1)

# Build a Channel -> rgb lookup
channel_color_dict = df.set_index('Channel')['rgb']

# Print information
print('channel_color_dict =\n', channel_color_dict)
# ### III.3.7. ROUNDS COLORS

# In[14]:

# ROUND
filename = "round_color_data.csv"
filename = os.path.join(metadata_dir, filename)

# Check the file exists
if not os.path.exists(filename):
    print("WARNING: Could not find desired file: " + filename)
else:
    print("The", filename, "file was imported for further analysis!")

# Open and read in information
df = pd.read_csv(filename, header=0)
df = df.drop(columns=['hex'])

# As above, parse the '(r, g, b)' strings back into tuples of floats
df['rgb'] = df.apply(lambda row: rgb_tuple_from_str(row['rgb']), axis=1)

# Verify size
print("Verifying data read from file is the correct length...\n")
#verify_line_no(filename, df.shape[0] + 1)

# Build a Round -> rgb lookup
round_color_dict = df.set_index('Round')['rgb']

# Print information
print('round_color_dict =\n', round_color_dict)
# ### III.3.8. CELL TYPES COLORS

# In[15]:

data = pd.read_csv(os.path.join(metadata_dir, 'celltype_color_data.csv'))
data

# In[16]:

filename = "celltype_color_data.csv"
filename = os.path.join(metadata_dir, filename)

# Check the file exists
if not os.path.exists(filename):
    print("WARNING: Could not find desired file: " + filename)
else:
    print("The", filename, "file was imported for further analysis!")

# Open and read in information
df = pd.read_csv(filename, header=0)

if all(col in df.columns for col in ['R', 'G', 'B']):
    # The RGB values are already in separate 'R', 'G', 'B' columns:
    # build the 'rgb' column as tuples of floats
    df['rgb'] = list(zip(df['R'], df['G'], df['B']))
else:
    # Otherwise the (r, g, b) tuple was read in as a string '(r, g, b)':
    # extract the r-, g- and b- substrings and convert them back into floats
    df['rgb'] = df.apply(lambda row: rgb_tuple_from_str(row['rgb']), axis=1)

# Verify size
print("Verifying data read from file is the correct length...\n")
#verify_line_no(filename, df.shape[0] + 1)

# Build a cell_type -> rgb lookup
cell_type_color_dict = df.set_index('cell_type')['rgb']

# Print information
print('cell_type_color_dict =\n', cell_type_color_dict)
# ### III.3.9. CELL SUBTYPES COLORS

# In[17]:

# Preview the file loaded in the previous cell
df = pd.read_csv(filename)
df.head()

# In[18]:

filename = "cellsubtype_color_data.csv"
filename = os.path.join(metadata_dir, filename)

# Check the file exists
if not os.path.exists(filename):
    print("WARNING: Could not find desired file: " + filename)
else:
    print("The", filename, "file was imported for further analysis!")

# Open and read in information
df = pd.read_csv(filename, header=0)
df = df.drop(columns=['hex'])

# As above, parse the '(r, g, b)' strings back into tuples of floats
df['rgb'] = df.apply(lambda row: rgb_tuple_from_str(row['rgb']), axis=1)

# Verify size
print("Verifying data read from file is the correct length...\n")
#verify_line_no(filename, df.shape[0] + 1)

# Build a cell_subtype -> rgb dictionary
cell_subtype_color_dict = df.set_index('cell_subtype')['rgb'].to_dict()

# Print information
print('cell_subtype_color_dict =\n', cell_subtype_color_dict)

# In[19]:

df = pd.read_csv(filename)
df.head()
# ### III.3.10. IMMUNE CHECKPOINT COLORS

# In[20]:

filename = "immunecheckpoint_color_data.csv"
filename = os.path.join(metadata_dir, filename)

# Check the file exists
if not os.path.exists(filename):
    print("WARNING: Could not find desired file: " + filename)
else:
    print("The", filename, "file was imported for further analysis!")

# Open and read in information
df = pd.read_csv(filename, header=0)
df = df.drop(columns=['hex'])

# Convert the 'rgb' column from string to tuple
df['rgb'] = df['rgb'].apply(rgb_tuple_from_str)

# Verify size
print("Verifying data read from file is the correct length...\n")
#verify_line_no(filename, df.shape[0] + 1)

# Build an immune_checkpoint -> rgb dictionary
immune_checkpoint_color_dict = df.set_index('immune_checkpoint')['rgb'].to_dict()

# Print information
print('immune_checkpoint_color_dict =\n', immune_checkpoint_color_dict)
immune_checkpoint_color_df = pd.DataFrame(immune_checkpoint_color_dict)
immune_checkpoint_color_df
# ### III.3.11. DATA

# In[21]:

# DATA
# Check that the input directory exists, then list its files
if os.path.exists(input_data_dir):
    # Keep only the background-subtracted CSV files
    ls_samples = [sample for sample in os.listdir(input_data_dir) if sample.endswith("_bs.csv")]
    print("The following CSV files were detected:")
    print(ls_samples)
else:
    print(f"The directory {input_data_dir} does not exist.")
# In[22]:

# Import all the other files
dfs = {}

# Gather the expected headers from the first file in ls_samples:
# read in only the first row of the first sample (index = 0)
df = pd.read_csv(os.path.join(input_data_dir, ls_samples[0]), index_col=0, nrows=1)
expected_headers = df.columns.values
#print(expected_headers)

###############################
# !! This may take a while !! #
###############################
for sample in ls_samples[:]:  # iterate over a copy so bad samples can be removed safely
    file_path = os.path.join(input_data_dir, sample)
    print(file_path)

    try:
        # Read the CSV file
        df = pd.read_csv(file_path, index_col=0)
        # Check if the DataFrame is empty; if so, don't continue processing it
        if not df.empty:
            # Reorder the columns to match the expected headers list
            df = df.reindex(columns=expected_headers)
            print(sample, "file is processed!\n")
            #print(df)
    except pd.errors.EmptyDataError:
        print(f'\nEmpty data error in {sample} file. Removing from analysis...')
        ls_samples.remove(sample)
        continue  # skip adding the unreadable sample to dfs

    # Add df to dfs
    dfs[sample] = df

#print(dfs)
# In[23]:

# Merge dfs into one df
df = pd.concat(dfs.values(), ignore_index=False, sort=False)
del dfs
merged_df = df

# In[24]:

merged_df

# In[25]:

merged_df_shape = df.shape

# In[26]:

merged_df_index = df.index

# In[27]:

merged_df_col_values = df.columns.values

# In[28]:

# Check for NaN entries (there should not be any unless columns do not align)
# False means no NaN entries, True means NaN entries
merged_df_null_values = df.isnull().any().any()

# In[29]:

df.isnull().any().any()
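# Tiny illustration (hypothetical frames) of the concat above: with
# ignore_index=False the row labels are preserved, so each cell keeps its
# '<sample>_<location>_<number>' index after the per-sample frames are stacked.
_a = pd.DataFrame({'x': [1]}, index=['S1_Cell_1'])
_b = pd.DataFrame({'x': [2]}, index=['S2_Cell_1'])
print(pd.concat([_a, _b], ignore_index=False, sort=False))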
# ## III.4. MARKERS

# In[30]:

# Listing all the markers of interest for downstream analyses
# !!TODO WITH MARILYNE!!
markers = [
    '53BP1_Nucleus_Intensity_Average',
    'AR_Nucleus_Intensity_Average',
    'CCNB1_Cell_Intensity_Average',
    'CCND1_Nucleus_Intensity_Average',
    'CCNE_Nucleus_Intensity_Average',
    'CD31_Cytoplasm_Intensity_Average',
    'CKs_Cytoplasm_Intensity_Average',
    'ERa_Nucleus_Intensity_Average',
    'Ecad_Cytoplasm_Intensity_Average',
    'GATA3_Nucleus_Intensity_Average',
    'H3K27_Nucleus_Intensity_Average',
    'H3K4me3_Nucleus_Intensity_Average',
    'HER2_Cytoplasm_Intensity_Average',
    'HSP90_Cell_Intensity_Average',
    'Ki67_Nucleus_Intensity_Average',
    'PAX8_Nucleus_Intensity_Average',
    'PCNA_Nucleus_Intensity_Average',
    'PRg_Nucleus_Intensity_Average',
    'S100b_Cytoplasm_Intensity_Average',
    'TP53_Cell_Intensity_Average',
    'Vimentin_Cytoplasm_Intensity_Average',
    'pAKT_Cytoplasm_Intensity_Average',
    'pATM_Nucleus_Intensity_Average',
    'pATR_Nucleus_Intensity_Average',
    'pERK_Cell_Intensity_Average',
    'pRB_Nucleus_Intensity_Average',
    'pS6_Cytoplasm_Intensity_Average',
    'AXL_Cytoplasm_Intensity_Average',
    'B7H4_Cell_Intensity_Average',
    'CD11c_Cytoplasm_Intensity_Average',
    'CD163_Cytoplasm_Intensity_Average',
    'CD20_Cytoplasm_Intensity_Average',
    'CD31_Cytoplasm_Intensity_Average',
    'CD44_Cytoplasm_Intensity_Average',
    'CD45_Cytoplasm_Intensity_Average',
    'CD45b_Cytoplasm_Intensity_Average',
    'CD4_Cytoplasm_Intensity_Average',
    'CD68_Cytoplasm_Intensity_Average',
    'CD8_Cytoplasm_Intensity_Average',
    'CKs_Cytoplasm_Intensity_Average',
    'ColVI_Cytoplasm_Intensity_Average',
    'Desmin_Cytoplasm_Intensity_Average',
    'Ecad_Cytoplasm_Intensity_Average',
    'FOXP3_Nucleus_Intensity_Average',
    'Fibronectin_Cytoplasm_Intensity_Average',
    'GATA3_Nucleus_Intensity_Average',
    'HLA_Cytoplasm_Intensity_Average',
    'Ki67_Nucleus_Intensity_Average',
    'MMP9_Cytoplasm_Intensity_Average',
    'PD1_Cytoplasm_Intensity_Average',
    'PDGFR_Cytoplasm_Intensity_Average',
    'PDL1_Cytoplasm_Intensity_Average',
    'Sting_Cytoplasm_Intensity_Average',
    'Vimentin_Cytoplasm_Intensity_Average',
    'aSMA_Cytoplasm_Intensity_Average'
]
# Several markers appear twice above (e.g. CD31, CKs, Ecad): drop duplicates
# while preserving order, so df[markers] does not create duplicate columns
markers = list(dict.fromkeys(markers))
# In[31]:

# Check that all columns in the markers list are present in the DataFrame
missing_columns = [col for col in markers if col not in df.columns]

if missing_columns:
    # Columns can be missing because those markers belong to the other slide
    print(f"The following columns are not present in the DataFrame ({len(missing_columns)} columns missing): \n{missing_columns}\n")
    # Keep only the marker columns that actually exist in the DataFrame
    intersected_columns = list(set(markers).intersection(df.columns))
    df_markers = df[intersected_columns]
else:
    # Keep only the columns in the markers list
    df_markers = df[markers]

initial_df_marker = df_markers.copy()  # snapshot before renaming/normalisation
df_markers.head()
# In[32]:

# Rename CD45b into CD45 (Slide A only!)
if project_name == 'Slide_A':
    df_markers = df_markers.rename(columns={"CD45b_Cytoplasm_Intensity_Average": "CD45_Cytoplasm_Intensity_Average"})
df_markers.columns.values

# In[33]:

df_markers.shape

# In[34]:

min_values = df_markers.min().tolist()
min_values
# In[35]:

# Keep the not_intensities and markers columns
# Combine both lists
combined_columns = list(set(markers) | set(not_intensities))

# Filter the DataFrame to keep only the combined columns that are present in df
df_markers_not_intensities = df[df.columns.intersection(combined_columns)]

# In[36]:

df_markers_not_intensities

# In[37]:

df_markers_not_intensities.shape
# ## III.5. NORMALISATION

# In[38]:

df_markers.min().tolist()

# In[39]:

'''# LOG2 TRANSFORMATION
# Values need to be higher than 0 for the log2 transformation.
print("df_marker.shape before normalisation: ", df_markers.shape)
df_marker_shape_before_norm = df_markers.shape

# Option 1
# This step might not be the best approach because it creates patterns in the data:
# set anything that is below 0 to 0 so that we can do the log transform, then +1 to all columns
#for f in df_markers.columns[~df_markers.columns.isin(not_intensities)]:
#    df_markers.loc[df_markers[f] < 0, f] = 0

# Option 2
# Add the min of the min values (from above) + 1 to all columns
#df_markers.loc[:, ~df_markers.columns.isin(not_intensities)] = \
#    df_markers.loc[:, ~df_markers.columns.isin(not_intensities)].copy() + 1
# Add the minimum value + 1 to each column
# OR'''
# In[40]:

# Option 3: shift all values so the global minimum becomes 1, then log2-transform
min_value = df_markers.min().min()
print("min value = ", min_value)
df_markers = df_markers + np.abs(min_value)

# +1
df_markers = df_markers + 1
df_after_norm = df_markers
df_marker_shape_after_norm = df_markers.shape
print("df_markers.shape after normalisation: ", df_markers.shape)
df_markers.min().tolist()

# Apply log2
df_markers.loc[:, ~df_markers.columns.isin(not_intensities)] = \
    np.log2(df_markers.loc[:, ~df_markers.columns.isin(not_intensities)])
print('log2 transform finished')
df_markers
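# Tiny illustration (hypothetical values) of the shift + log2 used above:
# with a global minimum of -3, every entry is shifted by abs(-3) + 1 = 4
# before the log2, so the smallest entry maps to log2(1) = 0.
_demo = pd.DataFrame({'m1': [-3.0, 1.0], 'm2': [0.0, 5.0]})
_demo = _demo + np.abs(_demo.min().min()) + 1
print(np.log2(_demo))  # smallest entry is exactly 0.0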
# In[75]:

# main
pn.extension()

# NB: this overwrites the not_intensities list loaded in III.3.2;
# add columns to exclude from the transformation if there are any
not_intensities = []

# Define transformation functions
def modify(df):
    # Shift so the global minimum becomes 1, then log2-transform
    min_value = df.min().min()
    df = df + np.abs(min_value)
    df = df + 1
    df.loc[:, ~df.columns.isin(not_intensities)] = np.log2(df.loc[:, ~df.columns.isin(not_intensities)])
    return df

def shift(df):
    # log2-transform only, without shifting
    df.loc[:, ~df.columns.isin(not_intensities)] = np.log2(df.loc[:, ~df.columns.isin(not_intensities)])
    return df

# Define the panel widgets
operation = pn.widgets.RadioButtonGroup(name='Operation', options=['Modify', 'Shift'], button_type='success')

# Update the DataFrame based on the selected operation
def update_dataframe(operation):
    df = df_markers.copy()
    if operation == 'Modify':
        modified_df = modify(df)
    elif operation == 'Shift':
        modified_df = shift(df)
    return modified_df.head(30)

# Create a panel layout
layout = pn.Column(
    pn.pane.Markdown("### Data Transformation"),
    operation,
    pn.pane.Markdown("### Transformed DataFrame"),
    pn.bind(update_dataframe, operation)
)

#df_after_norm
df_markers.columns.tolist()

# Check for NaN entries (there should not be any unless columns do not align)
# False means no NaN entries, True means NaN entries
df_markers.isnull().any().any()

count_nan_in_df_markers = df_markers.isnull().sum().sum()
print(count_nan_in_df_markers)
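# To preview just this widget outside the full template, one could run, e.g.:
#   layout.show(port=5006)   # panel's Viewable.show() opens a browser tab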
# ## III.6. Z-SCORE TRANSFORMATION

# In[49]:

# Filter the DataFrame df to keep only the columns specified in the not_intensities list
#df = df.loc[:, not_intensities]
#df

# Check that all columns in the not_intensities list are present in the DataFrame
missing_columns = [col for col in not_intensities if col not in df.columns]

if missing_columns:
    print(f"The following columns are not present in the DataFrame ({len(missing_columns)} columns missing): \
    \n{missing_columns}")
    # Keep only the not_intensities columns that actually exist in the DataFrame
    intersected_columns = list(set(not_intensities).intersection(df.columns))
    df = df[intersected_columns]
else:
    # Keep only the columns in the not_intensities list
    # (as written, this expression is not assigned back to df)
    df.loc[:, not_intensities]

df
# In[50]:

df

# In[51]:

df_merged = df_markers.merge(df, left_index=True, right_on='ID', how='left')
df_merged

# In[52]:

df_merged.columns.tolist()

# In[53]:

# Create a copy, just in case you need to restart the kernel
df_merged_copy = df_merged.copy()

# In[54]:

# Filter the rows of df_merged on the 'Sample_ID' column:
# df_subset contains only the rows whose 'Sample_ID' is in the list 'keep' ('TMA.csv' here)
keep = ['TMA.csv']
df_subset = df_merged.loc[df_merged['Sample_ID'].isin(keep), :].copy()
df_subset
# In[55]:

# Convert the DataFrame to numeric, coercing errors to NaN
df_numeric = df_subset.apply(pd.to_numeric, errors='coerce')

###############################
# !! This may take a while !! #
###############################

# Z-score normalization:
# z-score the rows (apply() with axis=1), only on the intensity (numeric) columns
df_subset.loc[:, ~df_subset.columns.isin(not_intensities)] = \
    df_numeric.loc[:, ~df_numeric.columns.isin(not_intensities)].apply(
        lambda row: (row - row.median()) / row.std(ddof=0), axis=1)

# Drop columns with all NaN values (if any)
df_subset.dropna(how='all', inplace=True, axis=1)
print('zscore rows finished')

# In[56]:

df_subset
df_numeric = df_merged.apply(pd.to_numeric, errors='coerce')

###############################
# !! This may take a while !! #
###############################

# Z-score the rows (apply() with axis=1), only on the intensity (numeric) columns
df_merged.loc[:, ~df_merged.columns.isin(not_intensities)] = \
    df_numeric.loc[:, ~df_numeric.columns.isin(not_intensities)].apply(
        lambda row: (row - row.median()) / row.std(ddof=0), axis=1)

df_merged.dropna(how='all', inplace=True, axis=1)
print('zscore rows finished')
df_merged
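# Tiny illustration (hypothetical values) of the row-wise scaling used above:
# each row is centred on its median and divided by its population std (ddof=0).
_row = pd.Series([1.0, 2.0, 6.0])
print((_row - _row.median()) / _row.std(ddof=0))  # -> [-0.4629..., 0.0, 1.8516...]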
# In[59]:

# Centre the intensity columns of df_merged on the TMA reference medians
df_merged.loc[:, ~df_merged.columns.isin(not_intensities)] = \
    df_merged.loc[:, ~df_merged.columns.isin(not_intensities)] - df_subset.loc[:, ~df_subset.columns.isin(not_intensities)].median()
df_merged

# In[60]:

# Scale by the TMA reference standard deviations (population std, ddof=0)
scaled = df_merged.loc[:, ~df_merged.columns.isin(not_intensities)] / \
    df_subset.loc[:, ~df_subset.columns.isin(not_intensities)].std(ddof=0)
df_merged.loc[:, ~df_merged.columns.isin(not_intensities)] = scaled
df_merged_zscore = scaled
df_merged_zscore
# In[61]:

# Check for NaN entries (there should not be any unless columns do not align)
# False means no NaN entries, True means NaN entries
df.isnull().any().any()

# In[62]:

quality_control_df = df_merged_zscore
# In[63]:

def check_index_format(index_str, ls_samples):
    """
    Check whether the given index string follows the expected
    '<sample>_<location>_<number>' format.

    Args:
        index_str (str): The index string to be checked.
        ls_samples (list): A list of valid sample names.

    Returns:
        bool: True if the index string follows the format, False otherwise.
    """
    # Split the index string into parts
    parts = index_str.split('_')

    # Check that there are exactly 3 parts
    if len(parts) != 3:
        print(len(parts))
        return False

    # Check that the first part is in ls_samples
    sample_name = parts[0]
    if f'{sample_name}_bs.csv' not in ls_samples:
        print(sample_name)
        return False

    # Check that the second part is in ['Cell', 'Cytoplasm', 'Nucleus']
    location = parts[1]
    valid_locations = ['Cell', 'Cytoplasm', 'Nucleus']
    if location not in valid_locations:
        print(location)
        return False

    # Check that the third part is a number
    try:
        int(parts[2])
    except ValueError:
        print(parts[2])
        return False

    # If all checks pass, return True
    return True

# Let's take a look at a few features to make sure our dataframe is as expected
def check_format_ofindex(index):
    for ix in index:
        check_index = check_index_format(ix, ls_samples)
        if check_index is False:
            index_format = "Bad"
            return index_format
    index_format = "Good"
    return index_format
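# Example (hypothetical index strings and sample list) of the format check
# above: the index must read '<sample>_<Location>_<number>'.
assert check_index_format('TMA_Nucleus_42', ['TMA_bs.csv']) is True
assert check_index_format('TMA_nucleus_42', ['TMA_bs.csv']) is False  # lowercase location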
# In[64]:

def quality_check(file, not_intensities):
    # Load the output file
    df = file

    # Check the index
    check_index = check_format_ofindex(df.index)

    # Check the shape
    check_shape = df.shape

    # Check for NaN entries
    check_no_null = df.isnull().any().any()

    # Remove rows whose mean intensity is zero
    mean_intensity = df.loc[:, ~df.columns.isin(not_intensities)].mean(axis=1)
    if (mean_intensity == 0).any():
        df = df.loc[mean_intensity > 0, :]
        print("df.shape after removing 0 mean values: ", df.shape)
        check_zero_intensities = f'Shape after removing 0 mean values: {df.shape}'
    else:
        print("No zero intensity values.")
        check_zero_intensities = "No zero intensity values."

    # Create a quality check results table
    quality_check_results_table = pd.DataFrame({
        'Check': ['Index', 'Shape', 'Check for NaN Entries', 'Check for Zero Intensities'],
        'Result': [str(check_index), str(check_shape), str(check_no_null), check_zero_intensities]
    })

    # Create a quality check results component
    quality_check_results_component = pn.Card(
        pn.pane.DataFrame(quality_check_results_table),
        title="Quality Control Results",
        header_background="#2196f3",
        header_color="white",
    )

    return quality_check_results_component
# In[76]:

# DataFrames assumed to be already defined at this point:
# metadata, merged_df, initial_df_marker, df_markers_not_intensities, df_after_norm,
# df_markers, df_subset, df_merged_zscore

# Create widgets and panes
df_widget = pn.widgets.DataFrame(metadata, name="MetaData")

# Define the tab contents
metadata_tab = pn.Column(
    pn.pane.Markdown("### Sample Metadata"),
    pn.pane.DataFrame(metadata.head()),
    pn.pane.Markdown("### Initial Dataframe"),
    pn.pane.DataFrame(initial_df_marker.head(), width=1500),
    pn.Row(pn.pane.Markdown("### Shape: "), pn.pane.Markdown(str(initial_df_marker.shape))),
    pn.pane.Markdown("### Merged Dataframe"),
    pn.pane.DataFrame(merged_df.head(), width=1500),
    pn.Row(pn.pane.Markdown("### Shape: "), pn.pane.Markdown(str(merged_df.shape))),
    pn.pane.Markdown("### Markers and not-intensities Dataframe"),
    pn.pane.DataFrame(df_markers_not_intensities.head(), width=1500),
    pn.Row(pn.pane.Markdown("### Shape: "),
           pn.pane.Markdown(str(df_markers_not_intensities.shape)))
)

normalization_tab = pn.Column(
    #pn.pane.Markdown("### Normalisation performed"),
    #pn.pane.DataFrame(df_after_norm.head()),
    #pn.Row(pn.pane.Markdown("### Shape before normalization: "),
    #       pn.pane.Markdown(str(df_marker_shape_before_norm))),
    #pn.Row(pn.pane.Markdown("### Shape after normalization: "),
    #       pn.pane.Markdown(str(df_marker_shape_after_norm))),
    #pn.pane.Markdown("### Performed log2 transformation"),
    #pn.pane.DataFrame(df_markers.head())
    layout
)

zscore_tab = pn.Column(
    #pn.pane.Markdown("### Performed Z-score transformation"),
    #pn.pane.DataFrame(df_subset.head(), width=1500),
    pn.pane.Markdown("### Z-score transformation finished"),
    pn.pane.DataFrame(df_merged_zscore.head(30), width=1500)
)

quality_control_tab = pn.Column(
    pn.pane.Markdown("### Quality Control"),
    quality_check(quality_control_df, not_intensities)
)

# Create the GoldenTemplate app
app3 = pn.template.GoldenTemplate(
    site="Cyc-IF",
    title="Z-Score Computation",
    main=[
        pn.Tabs(
            ("Metadata", metadata_tab),
            ("Normalization", normalization_tab),
            ("Z-Score", zscore_tab),
            ("Quality Control", quality_control_tab)
        )
    ]
)

app3.servable()

if __name__ == "__main__":
    pn.serve(app3, port=5007)