#!/usr/bin/env python
# coding: utf-8

import os
import random
import re
import subprocess
import warnings

import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt
import matplotlib.colors as mplc
from scipy import signal
from scipy.stats import pearsonr  # scipy.stats.stats is deprecated

import plotly
import plotly.figure_factory as ff
import plotly.graph_objs as go
import plotly.express as px
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import panel as pn

from my_modules import *

# Silence FutureWarnings & UserWarnings
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=UserWarning)
# ## III.2. DIRECTORIES

# In[4]:

# Set base directory
##### MAC WORKSTATION #####
#base_dir = r'/Volumes/LaboLabrie/Projets/OC_TMA_Pejovic/Temp/Zoe/CyCIF_pipeline/'
###########################

##### WINDOWS WORKSTATION #####
#base_dir = r'C:\Users\LaboLabrie\gerz2701\cyCIF-pipeline\Set_B'
###############################

##### LOCAL WORKSTATION #####
#base_dir = r'/Users/harshithakolipaka/Downloads/wetransfer_data-zip_2024-05-17_1431'
#############################

present_dir = os.path.dirname(os.path.realpath(__file__))
input_path = os.path.join(present_dir, 'wetransfer_data-zip_2024-05-17_1431')
base_dir = input_path

#set_name = 'Set_A'
set_name = 'test'
# In[5]:

set_path = set_name
selected_metadata_files = ['Slide_B_DD1s1.one_1.tif.csv', 'Slide_B_DD1s1.one_2.tif.csv']
ls_samples = ['Ashlar_Exposure_Time.csv', 'new_data.csv', 'DD3S1.csv', 'DD3S2.csv', 'DD3S3.csv', 'TMA.csv']

print(base_dir)
print(set_path)
print(ls_samples)
print(selected_metadata_files)
project_name = set_name            # Project name
step_suffix = 'zscore'             # Current step (here, part III)
previous_step_suffix_long = "_bs"  # Previous step (here, the BS notebook)

# Initial input data directory
input_data_dir = os.path.join(base_dir, project_name + previous_step_suffix_long)

# ZSCORE/LOG2 output directory
output_data_dir = os.path.join(base_dir, project_name + "_" + step_suffix)

# ZSCORE/LOG2 images subdirectory
output_images_dir = os.path.join(output_data_dir, "images")

# Metadata directory
metadata_dir = os.path.join(base_dir, project_name + "_metadata")

# Metadata images subdirectory
metadata_images_dir = os.path.join(metadata_dir, "images")

# Create directories if they don't already exist
for d in [base_dir, input_data_dir, output_data_dir, output_images_dir, metadata_dir, metadata_images_dir]:
    if not os.path.exists(d):
        print("Creation of the", d, "directory...")
        os.makedirs(d)
    else:
        print("The", d, "directory already exists!")

os.chdir(input_data_dir)
# In[7]:

# Verify paths
print('base_dir            :', base_dir)
print('input_data_dir      :', input_data_dir)
print('output_data_dir     :', output_data_dir)
print('output_images_dir   :', output_images_dir)
print('metadata_dir        :', metadata_dir)
print('metadata_images_dir :', metadata_images_dir)
# ## III.3. FILES

# Don't forget to put your data in the projname_data directory!

# ### III.3.1. METADATA

# In[8]:

# Import all metadata we need from the BS chapter

# METADATA
filename = "marker_intensity_metadata.csv"
filename = os.path.join(metadata_dir, filename)

# Check the file exists
if not os.path.exists(filename):
    print("WARNING: Could not find desired file: " + filename)
else:
    print("The", filename, "file was imported for further analysis!")

# Open and read in information
metadata = pd.read_csv(filename)

# Verify size with the verify_line_no() function in my_modules.py
#verify_line_no(filename, metadata.shape[0] + 1)

# Verify headers
exp_cols = ['Round', 'Target', 'Channel', 'target_lower', 'full_column', 'marker', 'localisation']
compare_headers(exp_cols, metadata.columns.values, "Marker metadata file")

metadata = metadata.dropna()
metadata.head()
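# compare_headers() comes from my_modules and is not shown here. A minimal
# sketch of what it is assumed to do (hypothetical name, so the real import
# is not shadowed):
def _compare_headers_sketch(expected, actual, title):
    """Warn about any difference between expected and actual column names."""
    missing = set(expected) - set(actual)
    extra = set(actual) - set(expected)
    if missing or extra:
        print(f"WARNING: {title} headers differ. Missing: {sorted(missing)}, unexpected: {sorted(extra)}")
    else:
        print(f"{title}: all expected headers are present.")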
# ### III.3.2. NOT_INTENSITIES

# In[9]:

filename = "not_intensities.csv"
filename = os.path.join(metadata_dir, filename)

# Check the file exists
if not os.path.exists(filename):
    print("WARNING: Could not find desired file: " + filename)
else:
    print("The", filename, "file was imported for further analysis!")

# Open and read in information:
# take the str, strip whitespace, split on the newline character
not_intensities = []
with open(filename, 'r') as fh:
    not_intensities = fh.read().strip().split("\n")

# Verify size
print("Verifying data read from file is the correct length...\n")
#verify_line_no(filename, len(not_intensities))

# Print to console
print("not_intensities =\n", not_intensities)
pd.DataFrame(not_intensities)
# ### III.3.3. FULL_TO_SHORT_COLUMN_NAMES

# In[10]:

filename = "full_to_short_column_names.csv"
filename = os.path.join(metadata_dir, filename)

# Check the file exists
if not os.path.exists(filename):
    print("WARNING: Could not find desired file: " + filename)
else:
    print("The", filename, "file was imported for further analysis!")

# Open and read in information
df = pd.read_csv(filename, header=0)

# Verify size
print("Verifying data read from file is the correct length...\n")
#verify_line_no(filename, df.shape[0] + 1)

# Turn into a dictionary mapping full_name -> short_name
full_to_short_names = df.set_index('full_name').T.to_dict('records')[0]

# CD45 instead of CD45b (Slide A only)
if project_name == 'Slide_A':
    full_to_short_names['CD45_Cytoplasm_Intensity_Average'] = full_to_short_names.pop('CD45b_Cytoplasm_Intensity_Average')
    full_to_short_names['CD45_Cytoplasm_Intensity_Average'] = 'CD45_Cytoplasm'

# Print information
print('full_to_short_names =\n', full_to_short_names)
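# Tiny illustration (hypothetical one-row table) of the
# set_index().T.to_dict('records')[0] idiom used above: it turns a
# two-column mapping table into a plain dict.
_map = pd.DataFrame({'full_name': ['AR_Nucleus_Intensity_Average'],
                     'short_name': ['AR_Nucleus']})
print(_map.set_index('full_name').T.to_dict('records')[0])
# -> {'AR_Nucleus_Intensity_Average': 'AR_Nucleus'}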
# ### III.3.4. SHORT_TO_FULL_COLUMN_NAMES

# In[11]:

filename = "short_to_full_column_names.csv"
filename = os.path.join(metadata_dir, filename)

# Check the file exists
if not os.path.exists(filename):
    print("WARNING: Could not find desired file: " + filename)
else:
    print("The", filename, "file was imported for further analysis!")

# Open and read in information
df = pd.read_csv(filename, header=0)

# Verify size
print("Verifying data read from file is the correct length...\n")
#verify_line_no(filename, df.shape[0] + 1)

# Turn into a dictionary mapping short_name -> full_name
short_to_full_names = df.set_index('short_name').T.to_dict('records')[0]

# CD45 instead of CD45b (Slide A only)
if project_name == 'Slide_A':
    short_to_full_names['CD45_Cytoplasm'] = short_to_full_names.pop('CD45b_Cytoplasm')
    short_to_full_names['CD45_Cytoplasm'] = 'CD45_Cytoplasm_Intensity_Average'

# Print information
print('short_to_full_names =\n', short_to_full_names)
# ### III.3.5. SAMPLES COLORS

# In[12]:

filename = "sample_color_data.csv"
filename = os.path.join(metadata_dir, filename)

# Check the file exists
if not os.path.exists(filename):
    print("WARNING: Could not find desired file: " + filename)
else:
    print("The", filename, "file was imported for further analysis!")

# Open and read in information
df = pd.read_csv(filename, header=0)
df = df.drop(columns=['hex'])

# Our tuple of float values for rgb, (r, g, b), was read in as a string
# '(r, g, b)': extract the r-, g- and b- substrings and convert them back into floats
df['rgb'] = df.apply(lambda row: rgb_tuple_from_str(row['rgb']), axis=1)

# Verify size
print("Verifying data read from file is the correct length...\n")
#verify_line_no(filename, df.shape[0] + 1)

# Build a Sample_ID -> rgb lookup
sample_color_dict = df.set_index('Sample_ID')['rgb']

# Print information
print('sample_color_dict =\n', sample_color_dict)
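# rgb_tuple_from_str() comes from my_modules and is not shown here. A minimal
# sketch of what it is assumed to do (hypothetical name, so the real import
# is not shadowed):
def _rgb_tuple_from_str_sketch(s):
    """Parse a string like '(0.1, 0.2, 0.3)' into a tuple of floats."""
    return tuple(float(x) for x in s.strip('()').split(','))

assert _rgb_tuple_from_str_sketch('(0.1, 0.2, 0.3)') == (0.1, 0.2, 0.3)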
# ### III.3.6. CHANNELS COLORS

# In[13]:

filename = "channel_color_data.csv"
filename = os.path.join(metadata_dir, filename)

# Check the file exists
if not os.path.exists(filename):
    print("WARNING: Could not find desired file: " + filename)
else:
    print("The", filename, "file was imported for further analysis!")

# Open and read in information
df = pd.read_csv(filename, header=0)
df = df.drop(columns=['hex'])

# As above, parse the '(r, g, b)' strings back into tuples of floats
df['rgb'] = df.apply(lambda row: rgb_tuple_from_str(row['rgb']), axis=1)

# Verify size
print("Verifying data read from file is the correct length...\n")
#verify_line_no(filename, df.shape[0] + 1)

# Build a Channel -> rgb lookup
channel_color_dict = df.set_index('Channel')['rgb']

# Print information
print('channel_color_dict =\n', channel_color_dict)
# ### III.3.7. ROUNDS COLORS

# In[14]:

# ROUND
filename = "round_color_data.csv"
filename = os.path.join(metadata_dir, filename)

# Check the file exists
if not os.path.exists(filename):
    print("WARNING: Could not find desired file: " + filename)
else:
    print("The", filename, "file was imported for further analysis!")

# Open and read in information
df = pd.read_csv(filename, header=0)
df = df.drop(columns=['hex'])

# As above, parse the '(r, g, b)' strings back into tuples of floats
df['rgb'] = df.apply(lambda row: rgb_tuple_from_str(row['rgb']), axis=1)

# Verify size
print("Verifying data read from file is the correct length...\n")
#verify_line_no(filename, df.shape[0] + 1)

# Build a Round -> rgb lookup
round_color_dict = df.set_index('Round')['rgb']

# Print information
print('round_color_dict =\n', round_color_dict)
# ### III.3.8. CELL TYPES COLORS

# In[15]:

data = pd.read_csv(os.path.join(metadata_dir, 'celltype_color_data.csv'))
data

# In[16]:

filename = "celltype_color_data.csv"
filename = os.path.join(metadata_dir, filename)

# Check the file exists
if not os.path.exists(filename):
    print("WARNING: Could not find desired file: " + filename)
else:
    print("The", filename, "file was imported for further analysis!")

# Open and read in information
df = pd.read_csv(filename, header=0)

if all(col in df.columns for col in ['R', 'G', 'B']):
    # The RGB values are already in separate 'R', 'G', 'B' columns:
    # build the 'rgb' column as tuples of floats
    df['rgb'] = list(zip(df['R'], df['G'], df['B']))
else:
    # Otherwise the (r, g, b) tuple was read in as a string '(r, g, b)':
    # extract the r-, g- and b- substrings and convert them back into floats
    df['rgb'] = df.apply(lambda row: rgb_tuple_from_str(row['rgb']), axis=1)

# Verify size
print("Verifying data read from file is the correct length...\n")
#verify_line_no(filename, df.shape[0] + 1)

# Build a cell_type -> rgb lookup
cell_type_color_dict = df.set_index('cell_type')['rgb']

# Print information
print('cell_type_color_dict =\n', cell_type_color_dict)
# ### III.3.9. CELL SUBTYPES COLORS

# In[17]:

# Preview the file loaded in the previous cell
df = pd.read_csv(filename)
df.head()

# In[18]:

filename = "cellsubtype_color_data.csv"
filename = os.path.join(metadata_dir, filename)

# Check the file exists
if not os.path.exists(filename):
    print("WARNING: Could not find desired file: " + filename)
else:
    print("The", filename, "file was imported for further analysis!")

# Open and read in information
df = pd.read_csv(filename, header=0)
df = df.drop(columns=['hex'])

# As above, parse the '(r, g, b)' strings back into tuples of floats
df['rgb'] = df.apply(lambda row: rgb_tuple_from_str(row['rgb']), axis=1)

# Verify size
print("Verifying data read from file is the correct length...\n")
#verify_line_no(filename, df.shape[0] + 1)

# Build a cell_subtype -> rgb dictionary
cell_subtype_color_dict = df.set_index('cell_subtype')['rgb'].to_dict()

# Print information
print('cell_subtype_color_dict =\n', cell_subtype_color_dict)

# In[19]:

df = pd.read_csv(filename)
df.head()
# ### III.3.10. IMMUNE CHECKPOINT COLORS

# In[20]:

filename = "immunecheckpoint_color_data.csv"
filename = os.path.join(metadata_dir, filename)

# Check the file exists
if not os.path.exists(filename):
    print("WARNING: Could not find desired file: " + filename)
else:
    print("The", filename, "file was imported for further analysis!")

# Open and read in information
df = pd.read_csv(filename, header=0)
df = df.drop(columns=['hex'])

# Convert the 'rgb' column from string to tuple
df['rgb'] = df['rgb'].apply(rgb_tuple_from_str)

# Verify size
print("Verifying data read from file is the correct length...\n")
#verify_line_no(filename, df.shape[0] + 1)

# Build an immune_checkpoint -> rgb dictionary
immune_checkpoint_color_dict = df.set_index('immune_checkpoint')['rgb'].to_dict()

# Print information
print('immune_checkpoint_color_dict =\n', immune_checkpoint_color_dict)
immune_checkpoint_color_df = pd.DataFrame(immune_checkpoint_color_dict)
immune_checkpoint_color_df
# ### III.3.11. DATA

# In[21]:

# DATA
# Check that the input directory exists, then list its files
if os.path.exists(input_data_dir):
    # Keep only the background-subtracted CSV files
    ls_samples = [sample for sample in os.listdir(input_data_dir) if sample.endswith("_bs.csv")]
    print("The following CSV files were detected:")
    print(ls_samples)
else:
    print(f"The directory {input_data_dir} does not exist.")
# In[22]:

# Import all the other files
dfs = {}

# Gather the expected headers from the first file in ls_samples:
# read in only the first row of the first sample (index = 0)
df = pd.read_csv(os.path.join(input_data_dir, ls_samples[0]), index_col=0, nrows=1)
expected_headers = df.columns.values
#print(expected_headers)

###############################
# !! This may take a while !! #
###############################
for sample in ls_samples[:]:  # iterate over a copy so bad samples can be removed safely
    file_path = os.path.join(input_data_dir, sample)
    print(file_path)

    try:
        # Read the CSV file
        df = pd.read_csv(file_path, index_col=0)
        # Check if the DataFrame is empty; if so, don't continue processing it
        if not df.empty:
            # Reorder the columns to match the expected headers list
            df = df.reindex(columns=expected_headers)
            print(sample, "file is processed!\n")
            #print(df)
    except pd.errors.EmptyDataError:
        print(f'\nEmpty data error in {sample} file. Removing from analysis...')
        ls_samples.remove(sample)
        continue  # skip adding the unreadable sample to dfs

    # Add df to dfs
    dfs[sample] = df

#print(dfs)
# In[23]:

# Merge dfs into one df
df = pd.concat(dfs.values(), ignore_index=False, sort=False)
del dfs
merged_df = df

# In[24]:

merged_df

# In[25]:

merged_df_shape = df.shape

# In[26]:

merged_df_index = df.index

# In[27]:

merged_df_col_values = df.columns.values

# In[28]:

# Check for NaN entries (there should not be any unless columns do not align)
# False means no NaN entries, True means NaN entries
merged_df_null_values = df.isnull().any().any()

# In[29]:

df.isnull().any().any()
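# Tiny illustration (hypothetical frames) of the concat above: with
# ignore_index=False the row labels are preserved, so each cell keeps its
# '<sample>_<location>_<number>' index after the per-sample frames are stacked.
_a = pd.DataFrame({'x': [1]}, index=['S1_Cell_1'])
_b = pd.DataFrame({'x': [2]}, index=['S2_Cell_1'])
print(pd.concat([_a, _b], ignore_index=False, sort=False))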
# ## III.4. MARKERS

# In[30]:

# Listing all the markers of interest for downstream analyses
# !!TODO WITH MARILYNE!!
markers = [
    '53BP1_Nucleus_Intensity_Average',
    'AR_Nucleus_Intensity_Average',
    'CCNB1_Cell_Intensity_Average',
    'CCND1_Nucleus_Intensity_Average',
    'CCNE_Nucleus_Intensity_Average',
    'CD31_Cytoplasm_Intensity_Average',
    'CKs_Cytoplasm_Intensity_Average',
    'ERa_Nucleus_Intensity_Average',
    'Ecad_Cytoplasm_Intensity_Average',
    'GATA3_Nucleus_Intensity_Average',
    'H3K27_Nucleus_Intensity_Average',
    'H3K4me3_Nucleus_Intensity_Average',
    'HER2_Cytoplasm_Intensity_Average',
    'HSP90_Cell_Intensity_Average',
    'Ki67_Nucleus_Intensity_Average',
    'PAX8_Nucleus_Intensity_Average',
    'PCNA_Nucleus_Intensity_Average',
    'PRg_Nucleus_Intensity_Average',
    'S100b_Cytoplasm_Intensity_Average',
    'TP53_Cell_Intensity_Average',
    'Vimentin_Cytoplasm_Intensity_Average',
    'pAKT_Cytoplasm_Intensity_Average',
    'pATM_Nucleus_Intensity_Average',
    'pATR_Nucleus_Intensity_Average',
    'pERK_Cell_Intensity_Average',
    'pRB_Nucleus_Intensity_Average',
    'pS6_Cytoplasm_Intensity_Average',
    'AXL_Cytoplasm_Intensity_Average',
    'B7H4_Cell_Intensity_Average',
    'CD11c_Cytoplasm_Intensity_Average',
    'CD163_Cytoplasm_Intensity_Average',
    'CD20_Cytoplasm_Intensity_Average',
    'CD31_Cytoplasm_Intensity_Average',
    'CD44_Cytoplasm_Intensity_Average',
    'CD45_Cytoplasm_Intensity_Average',
    'CD45b_Cytoplasm_Intensity_Average',
    'CD4_Cytoplasm_Intensity_Average',
    'CD68_Cytoplasm_Intensity_Average',
    'CD8_Cytoplasm_Intensity_Average',
    'CKs_Cytoplasm_Intensity_Average',
    'ColVI_Cytoplasm_Intensity_Average',
    'Desmin_Cytoplasm_Intensity_Average',
    'Ecad_Cytoplasm_Intensity_Average',
    'FOXP3_Nucleus_Intensity_Average',
    'Fibronectin_Cytoplasm_Intensity_Average',
    'GATA3_Nucleus_Intensity_Average',
    'HLA_Cytoplasm_Intensity_Average',
    'Ki67_Nucleus_Intensity_Average',
    'MMP9_Cytoplasm_Intensity_Average',
    'PD1_Cytoplasm_Intensity_Average',
    'PDGFR_Cytoplasm_Intensity_Average',
    'PDL1_Cytoplasm_Intensity_Average',
    'Sting_Cytoplasm_Intensity_Average',
    'Vimentin_Cytoplasm_Intensity_Average',
    'aSMA_Cytoplasm_Intensity_Average'
]
# Several markers appear twice above (e.g. CD31, CKs, Ecad): drop duplicates
# while preserving order, so df[markers] does not create duplicate columns
markers = list(dict.fromkeys(markers))
# In[31]:

# Check that all columns in the markers list are present in the DataFrame
missing_columns = [col for col in markers if col not in df.columns]

if missing_columns:
    # Columns can be missing because those markers belong to the other slide
    print(f"The following columns are not present in the DataFrame ({len(missing_columns)} columns missing): \n{missing_columns}\n")
    # Keep only the marker columns that actually exist in the DataFrame
    intersected_columns = list(set(markers).intersection(df.columns))
    df_markers = df[intersected_columns]
else:
    # Keep only the columns in the markers list
    df_markers = df[markers]

initial_df_marker = df_markers.copy()  # snapshot before renaming/normalisation
df_markers.head()
# In[32]:

# Rename CD45b into CD45 (Slide A only!)
if project_name == 'Slide_A':
    df_markers = df_markers.rename(columns={"CD45b_Cytoplasm_Intensity_Average": "CD45_Cytoplasm_Intensity_Average"})
df_markers.columns.values

# In[33]:

df_markers.shape

# In[34]:

min_values = df_markers.min().tolist()
min_values
# In[35]:

# Keep the not_intensities and markers columns
# Combine both lists
combined_columns = list(set(markers) | set(not_intensities))

# Filter the DataFrame to keep only the combined columns that are present in df
df_markers_not_intensities = df[df.columns.intersection(combined_columns)]

# In[36]:

df_markers_not_intensities

# In[37]:

df_markers_not_intensities.shape
# ## III.5. NORMALISATION

# In[38]:

df_markers.min().tolist()

# In[39]:

'''# LOG2 TRANSFORMATION
# Values need to be higher than 0 for the log2 transformation.
print("df_marker.shape before normalisation: ", df_markers.shape)
df_marker_shape_before_norm = df_markers.shape

# Option 1
# This step might not be the best approach because it creates patterns in the data:
# set anything that is below 0 to 0 so that we can do the log transform, then +1 to all columns
#for f in df_markers.columns[~df_markers.columns.isin(not_intensities)]:
#    df_markers.loc[df_markers[f] < 0, f] = 0

# Option 2
# Add the min of the min values (from above) + 1 to all columns
#df_markers.loc[:, ~df_markers.columns.isin(not_intensities)] = \
#    df_markers.loc[:, ~df_markers.columns.isin(not_intensities)].copy() + 1
# Add the minimum value + 1 to each column
# OR'''
# In[40]:

# Option 3: shift all values so the global minimum becomes 1, then log2-transform
min_value = df_markers.min().min()
print("min value = ", min_value)
df_markers = df_markers + np.abs(min_value)

# +1
df_markers = df_markers + 1
df_after_norm = df_markers
df_marker_shape_after_norm = df_markers.shape
print("df_markers.shape after normalisation: ", df_markers.shape)
df_markers.min().tolist()

# Apply log2
df_markers.loc[:, ~df_markers.columns.isin(not_intensities)] = \
    np.log2(df_markers.loc[:, ~df_markers.columns.isin(not_intensities)])
print('log2 transform finished')
df_markers
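# Tiny illustration (hypothetical values) of the shift + log2 used above:
# with a global minimum of -3, every entry is shifted by abs(-3) + 1 = 4
# before the log2, so the smallest entry maps to log2(1) = 0.
_demo = pd.DataFrame({'m1': [-3.0, 1.0], 'm2': [0.0, 5.0]})
_demo = _demo + np.abs(_demo.min().min()) + 1
print(np.log2(_demo))  # smallest entry is exactly 0.0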
# In[75]:

# main
pn.extension()

# NB: this overwrites the not_intensities list loaded in III.3.2;
# add columns to exclude from the transformation if there are any
not_intensities = []

# Define transformation functions
def modify(df):
    # Shift so the global minimum becomes 1, then log2-transform
    min_value = df.min().min()
    df = df + np.abs(min_value)
    df = df + 1
    df.loc[:, ~df.columns.isin(not_intensities)] = np.log2(df.loc[:, ~df.columns.isin(not_intensities)])
    return df

def shift(df):
    # log2-transform only, without shifting
    df.loc[:, ~df.columns.isin(not_intensities)] = np.log2(df.loc[:, ~df.columns.isin(not_intensities)])
    return df

# Define the panel widgets
operation = pn.widgets.RadioButtonGroup(name='Operation', options=['Modify', 'Shift'], button_type='success')

# Update the DataFrame based on the selected operation
def update_dataframe(operation):
    df = df_markers.copy()
    if operation == 'Modify':
        modified_df = modify(df)
    elif operation == 'Shift':
        modified_df = shift(df)
    return modified_df.head(30)

# Create a panel layout
layout = pn.Column(
    pn.pane.Markdown("### Data Transformation"),
    operation,
    pn.pane.Markdown("### Transformed DataFrame"),
    pn.bind(update_dataframe, operation)
)

#df_after_norm
df_markers.columns.tolist()

# Check for NaN entries (there should not be any unless columns do not align)
# False means no NaN entries, True means NaN entries
df_markers.isnull().any().any()

count_nan_in_df_markers = df_markers.isnull().sum().sum()
print(count_nan_in_df_markers)
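# To preview just this widget outside the full template, one could run, e.g.:
#   layout.show(port=5006)   # panel's Viewable.show() opens a browser tab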
# ## III.6. Z-SCORE TRANSFORMATION

# In[49]:

# Filter the DataFrame df to keep only the columns specified in the not_intensities list
#df = df.loc[:, not_intensities]
#df

# Check that all columns in the not_intensities list are present in the DataFrame
missing_columns = [col for col in not_intensities if col not in df.columns]

if missing_columns:
    print(f"The following columns are not present in the DataFrame ({len(missing_columns)} columns missing): \
    \n{missing_columns}")
    # Keep only the not_intensities columns that actually exist in the DataFrame
    intersected_columns = list(set(not_intensities).intersection(df.columns))
    df = df[intersected_columns]
else:
    # Keep only the columns in the not_intensities list
    # (as written, this expression is not assigned back to df)
    df.loc[:, not_intensities]

df
# In[50]:

df

# In[51]:

df_merged = df_markers.merge(df, left_index=True, right_on='ID', how='left')
df_merged

# In[52]:

df_merged.columns.tolist()

# In[53]:

# Create a copy, just in case you need to restart the kernel
df_merged_copy = df_merged.copy()

# In[54]:

# Filter the rows of df_merged on the 'Sample_ID' column:
# df_subset contains only the rows whose 'Sample_ID' is in the list 'keep' ('TMA.csv' here)
keep = ['TMA.csv']
df_subset = df_merged.loc[df_merged['Sample_ID'].isin(keep), :].copy()
df_subset
# In[55]:

# Convert the DataFrame to numeric, coercing errors to NaN
df_numeric = df_subset.apply(pd.to_numeric, errors='coerce')

###############################
# !! This may take a while !! #
###############################

# Z-score normalization:
# z-score the rows (apply() with axis=1), only on the intensity (numeric) columns
df_subset.loc[:, ~df_subset.columns.isin(not_intensities)] = \
    df_numeric.loc[:, ~df_numeric.columns.isin(not_intensities)].apply(
        lambda row: (row - row.median()) / row.std(ddof=0), axis=1)

# Drop columns with all NaN values (if any)
df_subset.dropna(how='all', inplace=True, axis=1)
print('zscore rows finished')

# In[56]:

df_subset
df_numeric = df_merged.apply(pd.to_numeric, errors='coerce')

###############################
# !! This may take a while !! #
###############################

# Z-score the rows (apply() with axis=1), only on the intensity (numeric) columns
df_merged.loc[:, ~df_merged.columns.isin(not_intensities)] = \
    df_numeric.loc[:, ~df_numeric.columns.isin(not_intensities)].apply(
        lambda row: (row - row.median()) / row.std(ddof=0), axis=1)

df_merged.dropna(how='all', inplace=True, axis=1)
print('zscore rows finished')
df_merged
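# Tiny illustration (hypothetical values) of the row-wise scaling used above:
# each row is centred on its median and divided by its population std (ddof=0).
_row = pd.Series([1.0, 2.0, 6.0])
print((_row - _row.median()) / _row.std(ddof=0))  # -> [-0.4629..., 0.0, 1.8516...]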
# In[59]:

# Centre the intensity columns of df_merged on the TMA reference medians
df_merged.loc[:, ~df_merged.columns.isin(not_intensities)] = \
    df_merged.loc[:, ~df_merged.columns.isin(not_intensities)] - df_subset.loc[:, ~df_subset.columns.isin(not_intensities)].median()
df_merged

# In[60]:

# Scale by the TMA reference standard deviations (population std, ddof=0)
scaled = df_merged.loc[:, ~df_merged.columns.isin(not_intensities)] / \
    df_subset.loc[:, ~df_subset.columns.isin(not_intensities)].std(ddof=0)
df_merged.loc[:, ~df_merged.columns.isin(not_intensities)] = scaled
df_merged_zscore = scaled
df_merged_zscore
# In[61]:

# Check for NaN entries (there should not be any unless columns do not align)
# False means no NaN entries, True means NaN entries
df.isnull().any().any()

# In[62]:

quality_control_df = df_merged_zscore
# In[63]:

def check_index_format(index_str, ls_samples):
    """
    Check whether the given index string follows the expected
    '<sample>_<location>_<number>' format.

    Args:
        index_str (str): The index string to be checked.
        ls_samples (list): A list of valid sample names.

    Returns:
        bool: True if the index string follows the format, False otherwise.
    """
    # Split the index string into parts
    parts = index_str.split('_')

    # Check that there are exactly 3 parts
    if len(parts) != 3:
        print(len(parts))
        return False

    # Check that the first part is in ls_samples
    sample_name = parts[0]
    if f'{sample_name}_bs.csv' not in ls_samples:
        print(sample_name)
        return False

    # Check that the second part is in ['Cell', 'Cytoplasm', 'Nucleus']
    location = parts[1]
    valid_locations = ['Cell', 'Cytoplasm', 'Nucleus']
    if location not in valid_locations:
        print(location)
        return False

    # Check that the third part is a number
    try:
        int(parts[2])
    except ValueError:
        print(parts[2])
        return False

    # If all checks pass, return True
    return True

# Let's take a look at a few features to make sure our dataframe is as expected
def check_format_ofindex(index):
    for ix in index:
        check_index = check_index_format(ix, ls_samples)
        if check_index is False:
            index_format = "Bad"
            return index_format
    index_format = "Good"
    return index_format
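# Example (hypothetical index strings and sample list) of the format check
# above: the index must read '<sample>_<Location>_<number>'.
assert check_index_format('TMA_Nucleus_42', ['TMA_bs.csv']) is True
assert check_index_format('TMA_nucleus_42', ['TMA_bs.csv']) is False  # lowercase location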
# In[64]:

def quality_check(file, not_intensities):
    # Load the output file
    df = file

    # Check the index
    check_index = check_format_ofindex(df.index)

    # Check the shape
    check_shape = df.shape

    # Check for NaN entries
    check_no_null = df.isnull().any().any()

    # Remove rows whose mean intensity is zero
    mean_intensity = df.loc[:, ~df.columns.isin(not_intensities)].mean(axis=1)
    if (mean_intensity == 0).any():
        df = df.loc[mean_intensity > 0, :]
        print("df.shape after removing 0 mean values: ", df.shape)
        check_zero_intensities = f'Shape after removing 0 mean values: {df.shape}'
    else:
        print("No zero intensity values.")
        check_zero_intensities = "No zero intensity values."

    # Create a quality check results table
    quality_check_results_table = pd.DataFrame({
        'Check': ['Index', 'Shape', 'Check for NaN Entries', 'Check for Zero Intensities'],
        'Result': [str(check_index), str(check_shape), str(check_no_null), check_zero_intensities]
    })

    # Create a quality check results component
    quality_check_results_component = pn.Card(
        pn.pane.DataFrame(quality_check_results_table),
        title="Quality Control Results",
        header_background="#2196f3",
        header_color="white",
    )

    return quality_check_results_component
# In[76]:

# DataFrames assumed to be already defined at this point:
# metadata, merged_df, initial_df_marker, df_markers_not_intensities, df_after_norm,
# df_markers, df_subset, df_merged_zscore

# Create widgets and panes
df_widget = pn.widgets.DataFrame(metadata, name="MetaData")

# Define the tab contents
metadata_tab = pn.Column(
    pn.pane.Markdown("### Sample Metadata"),
    pn.pane.DataFrame(metadata.head()),
    pn.pane.Markdown("### Initial Dataframe"),
    pn.pane.DataFrame(initial_df_marker.head(), width=1500),
    pn.Row(pn.pane.Markdown("### Shape: "), pn.pane.Markdown(str(initial_df_marker.shape))),
    pn.pane.Markdown("### Merged Dataframe"),
    pn.pane.DataFrame(merged_df.head(), width=1500),
    pn.Row(pn.pane.Markdown("### Shape: "), pn.pane.Markdown(str(merged_df.shape))),
    pn.pane.Markdown("### Markers and not-intensities Dataframe"),
    pn.pane.DataFrame(df_markers_not_intensities.head(), width=1500),
    pn.Row(pn.pane.Markdown("### Shape: "),
           pn.pane.Markdown(str(df_markers_not_intensities.shape)))
)

normalization_tab = pn.Column(
    #pn.pane.Markdown("### Normalisation performed"),
    #pn.pane.DataFrame(df_after_norm.head()),
    #pn.Row(pn.pane.Markdown("### Shape before normalization: "),
    #       pn.pane.Markdown(str(df_marker_shape_before_norm))),
    #pn.Row(pn.pane.Markdown("### Shape after normalization: "),
    #       pn.pane.Markdown(str(df_marker_shape_after_norm))),
    #pn.pane.Markdown("### Performed log2 transformation"),
    #pn.pane.DataFrame(df_markers.head())
    layout
)

zscore_tab = pn.Column(
    #pn.pane.Markdown("### Performed Z-score transformation"),
    #pn.pane.DataFrame(df_subset.head(), width=1500),
    pn.pane.Markdown("### Z-score transformation finished"),
    pn.pane.DataFrame(df_merged_zscore.head(30), width=1500)
)

quality_control_tab = pn.Column(
    pn.pane.Markdown("### Quality Control"),
    quality_check(quality_control_df, not_intensities)
)

# Create the GoldenTemplate app
app3 = pn.template.GoldenTemplate(
    site="Cyc-IF",
    title="Z-Score Computation",
    main=[
        pn.Tabs(
            ("Metadata", metadata_tab),
            ("Normalization", normalization_tab),
            ("Z-Score", zscore_tab),
            ("Quality Control", quality_control_tab)
        )
    ]
)

app3.servable()

if __name__ == "__main__":
    pn.serve(app3, port=5007)