Metadata from ROSMAP Samples
library(stringr) library(dplyr)
library(viridis)
library(gplots)
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.5.2
library(reshape2) library(RColorBrewer) library(ggrepel) library(tidyr)
library(DT) library(patchwork) library(corrplot)
library(plotly)
library(htmlwidgets) library(qvalue) # Open sample information file as "sample_data" for TMT specific 610 samples (round 1 and round2) sample_data <- read.csv("2c.traits-ROSMAP610.R1+R2-2xTAMPORcorrected.csv", sep =",", header = TRUE, stringsAsFactors = FALSE) dim(sample_data)
## [1] 610 40
head(sample_data)
## batch.channel projid.ROSMAP study.ROSMAP scaled_to.ROSMAP apoe_genotype ## 1 b01.127N 20271359 ROS ROSMAP e33 ## 2 b01.127C 33477756 MAP ROSMAP e33 ## 3 b01.128N 38967303 MAP ROSMAP e33 ## 4 b01.128C 10100736 ROS ROSMAP e34 ## 5 b01.129N 10298957 ROS ROSMAP e33 ## 6 b01.129C 46291609 MAP ROSMAP e33 ## ApoE4.Dose ApoE.Risk age_first_ad_dx cogdx cogdx_stroke dxpark cogn_global_lv ## 1 0 0 90.00 4 2 2 -0.36 ## 2 0 0 NA 1 2 2 0.00 ## 3 0 0 NA 1 1 2 0.45 ## 4 1 1 NA 1 2 2 0.39 ## 5 0 0 81.65 4 2 1 -1.63 ## 6 0 0 NA 2 2 2 -0.08 ## cts_mmse30_lv age_death educ msex race7 braaksc ceradsc_RADCnonStd gpath ## 1 25 90.00 18 0 1 4 2 0.24 ## 2 28 90.00 16 0 1 3 2 0.71 ## 3 28 84.83 11 0 1 3 4 0.09 ## 4 28 90.00 14 1 1 4 3 0.34 ## 5 10 82.37 24 1 1 3 2 0.85 ## 6 24 87.59 14 0 1 4 2 0.89 ## niareagansc pmi amyloid amyloid_mf dlbdx mglia123_mf mglia23_mf mglia3_mf ## 1 2 18.33 1.17 1.42 2 NA NA NA ## 2 2 4.77 5.40 7.87 2 NA NA NA ## 3 3 6.50 0.00 0.00 0 170.35 3.91 0 ## 4 3 15.17 2.47 5.06 0 NA NA NA ## 5 2 3.93 8.49 13.82 3 NA NA NA ## 6 2 7.42 7.78 18.42 0 NA NA NA ## nft tangles tangles_mf tdp_cs_6reg tdp_st4 brainwt CERAD JohnsonDx ## 1 0.44 3.55 0.02 1.83 2 1138 2 AsymAD ## 2 0.17 2.07 0.00 0.67 1 1110 2 AsymAD ## 3 0.28 3.02 0.00 0.17 1 1250 0 Control ## 4 0.71 4.95 0.00 0.00 0 1130 1 AsymAD ## 5 0.43 5.12 0.00 0.00 0 1300 2 AD ## 6 0.48 3.40 0.03 0.00 0 1192 2 AsymAD ## Batch GIS BatchColor tissue ## 1 ROSMAP.R1.400 <NA> turquoise dorsolateral prefrontal cortex ## 2 ROSMAP.R1.400 <NA> turquoise dorsolateral prefrontal cortex ## 3 ROSMAP.R1.400 GIS turquoise dorsolateral prefrontal cortex ## 4 ROSMAP.R1.400 <NA> turquoise dorsolateral prefrontal cortex ## 5 ROSMAP.R1.400 <NA> turquoise dorsolateral prefrontal cortex ## 6 ROSMAP.R1.400 <NA> turquoise dorsolateral prefrontal cortex
# Define the sex_type (sex) based on the msex value sample_data$sex <- with(sample_data, ifelse(msex == 0, "female", ifelse(msex == 1, "male", "NA"))) # Create a new categorical column based on cts_mmse30_lv sample_data$cts_mmse30_cat <- ifelse(sample_data$cts_mmse30_lv <= 23, "0", "1") dim(sample_data)
## [1] 610 42
head(sample_data)
## batch.channel projid.ROSMAP study.ROSMAP scaled_to.ROSMAP apoe_genotype ## 1 b01.127N 20271359 ROS ROSMAP e33 ## 2 b01.127C 33477756 MAP ROSMAP e33 ## 3 b01.128N 38967303 MAP ROSMAP e33 ## 4 b01.128C 10100736 ROS ROSMAP e34 ## 5 b01.129N 10298957 ROS ROSMAP e33 ## 6 b01.129C 46291609 MAP ROSMAP e33 ## ApoE4.Dose ApoE.Risk age_first_ad_dx cogdx cogdx_stroke dxpark cogn_global_lv ## 1 0 0 90.00 4 2 2 -0.36 ## 2 0 0 NA 1 2 2 0.00 ## 3 0 0 NA 1 1 2 0.45 ## 4 1 1 NA 1 2 2 0.39 ## 5 0 0 81.65 4 2 1 -1.63 ## 6 0 0 NA 2 2 2 -0.08 ## cts_mmse30_lv age_death educ msex race7 braaksc ceradsc_RADCnonStd gpath ## 1 25 90.00 18 0 1 4 2 0.24 ## 2 28 90.00 16 0 1 3 2 0.71 ## 3 28 84.83 11 0 1 3 4 0.09 ## 4 28 90.00 14 1 1 4 3 0.34 ## 5 10 82.37 24 1 1 3 2 0.85 ## 6 24 87.59 14 0 1 4 2 0.89 ## niareagansc pmi amyloid amyloid_mf dlbdx mglia123_mf mglia23_mf mglia3_mf ## 1 2 18.33 1.17 1.42 2 NA NA NA ## 2 2 4.77 5.40 7.87 2 NA NA NA ## 3 3 6.50 0.00 0.00 0 170.35 3.91 0 ## 4 3 15.17 2.47 5.06 0 NA NA NA ## 5 2 3.93 8.49 13.82 3 NA NA NA ## 6 2 7.42 7.78 18.42 0 NA NA NA ## nft tangles tangles_mf tdp_cs_6reg tdp_st4 brainwt CERAD JohnsonDx ## 1 0.44 3.55 0.02 1.83 2 1138 2 AsymAD ## 2 0.17 2.07 0.00 0.67 1 1110 2 AsymAD ## 3 0.28 3.02 0.00 0.17 1 1250 0 Control ## 4 0.71 4.95 0.00 0.00 0 1130 1 AsymAD ## 5 0.43 5.12 0.00 0.00 0 1300 2 AD ## 6 0.48 3.40 0.03 0.00 0 1192 2 AsymAD ## Batch GIS BatchColor tissue sex ## 1 ROSMAP.R1.400 <NA> turquoise dorsolateral prefrontal cortex female ## 2 ROSMAP.R1.400 <NA> turquoise dorsolateral prefrontal cortex female ## 3 ROSMAP.R1.400 GIS turquoise dorsolateral prefrontal cortex female ## 4 ROSMAP.R1.400 <NA> turquoise dorsolateral prefrontal cortex male ## 5 ROSMAP.R1.400 <NA> turquoise dorsolateral prefrontal cortex male ## 6 ROSMAP.R1.400 <NA> turquoise dorsolateral prefrontal cortex female ## cts_mmse30_cat ## 1 1 ## 2 1 ## 3 1 ## 4 1 ## 5 0 ## 6 1
################################################################################################################################################## # Select a sub-set of the sample (Tissue: DLPFC) from sample_data file sample_data_DLPFC <- filter(sample_data, tissue == "dorsolateral prefrontal cortex") dim(sample_data_DLPFC)
## [1] 610 42
# Read the outliers file outliers <- read.table("outliers.txt", header = TRUE) # Extract the IDs to be removed ids_to_remove <- outliers$Outliers ids_to_remove
## [1] "b22.130N" "b52.130N"
# Use the 'subset' function to remove rows with specified IDs (outliers) from the sample description file with CT samples sample_data_DLPFC <- subset(sample_data_DLPFC, !(batch.channel %in% ids_to_remove)) dim(sample_data_DLPFC)
## [1] 608 42
################################################################################################################################################## # Create a subset of the DLPFC sample description file with control samples only sample_data_DLPFC_contorl <- sample_data_DLPFC[grep("Control", sample_data_DLPFC$JohnsonDx), ] dim(sample_data_DLPFC_contorl)
## [1] 114 42
# Create a subset of the control sample description file with female samples only #sample_data_DLPFC_contorl_female <- sample_data_DLPFC_contorl[grep("female", sample_data_DLPFC_contorl$sex), ] #dim(sample_data_DLPFC_contorl_female) # Create a subset of the control sample description file with male samples only #sample_data_DLPFC_contorl_male <- sample_data_DLPFC_contorl[sample_data_DLPFC_contorl$sex == "male", ] #dim(sample_data_DLPFC_contorl_male) ################################################################################################################################################## # Create a subset of the DLPFC sample description file with AD samples only sample_data_DLPFC_AD <- sample_data_DLPFC[sample_data_DLPFC$JohnsonDx == "AD", ] dim(sample_data_DLPFC_AD)
## [1] 175 42
# Create a subset of the AD sample description file with female samples only #sample_data_DLPFC_AD_female <- sample_data_DLPFC_AD[grep("female", sample_data_DLPFC_AD$sex), ] #dim(sample_data_DLPFC_AD_female) # Create a subset of the AD sample description file with male samples only #sample_data_DLPFC_AD_male <- sample_data_DLPFC_AD[sample_data_DLPFC_AD$sex == "male", ] #dim(sample_data_DLPFC_AD_male) ################################################################################################################################################## # Create a subset of the DLPFC sample description file with AsymAD samples only sample_data_DLPFC_AsymAD <- sample_data_DLPFC[grep("AsymAD", sample_data_DLPFC$JohnsonDx), ] dim(sample_data_DLPFC_AsymAD)
## [1] 220 42
# Create a subset of the AD sample description file with female samples only #sample_data_DLPFC_AsymAD_female <- sample_data_DLPFC_AsymAD[grep("female", sample_data_DLPFC_AsymAD$sex), ] #dim(sample_data_DLPFC_AsymAD_female) # Create a subset of the AD sample description file with male samples only #sample_data_DLPFC_AsymAD_male <- sample_data_DLPFC_AsymAD[sample_data_DLPFC_AsymAD$sex == "male", ] #dim(sample_data_DLPFC_AsymAD_male) ################################################################################################################################################## # Merge both datasets by stacking rows sample_data_DLPFC_cases <- rbind(sample_data_DLPFC_AD, sample_data_DLPFC_AsymAD) # Check the dimensions to ensure it merged correctly dim(sample_data_DLPFC_cases)
## [1] 395 42
################################################################################################################################################## # Merge CT, AD and AsymAD altogether sample_data_DLPFC_CTADAsymAD <- rbind(sample_data_DLPFC_contorl, sample_data_DLPFC_cases) dim(sample_data_DLPFC_CTADAsymAD)
## [1] 509 42
head(sample_data_DLPFC_CTADAsymAD)
## batch.channel projid.ROSMAP study.ROSMAP scaled_to.ROSMAP apoe_genotype ## 3 b01.128N 38967303 MAP ROSMAP e33 ## 8 b01.130C 65736039 MAP ROSMAP e33 ## 12 b02.128C 67185070 MAP ROSMAP e23 ## 23 b03.130N 39484737 MAP ROSMAP e33 ## 28 b04.128C 70625336 MAP ROSMAP e34 ## 34 b05.127C 21001771 ROS ROSMAP e23 ## ApoE4.Dose ApoE.Risk age_first_ad_dx cogdx cogdx_stroke dxpark ## 3 0 0 NA 1 1 2 ## 8 0 0 NA 2 2 2 ## 12 0 -1 NA 1 2 2 ## 23 0 0 NA 3 2 2 ## 28 1 1 NA 1 1 2 ## 34 0 -1 NA 1 2 2 ## cogn_global_lv cts_mmse30_lv age_death educ msex race7 braaksc ## 3 0.45 28 84.83 11 0 1 3 ## 8 -0.64 27 90.00 17 1 1 3 ## 12 0.42 28 77.80 12 0 1 3 ## 23 -0.52 27 86.29 13 1 1 1 ## 28 -0.32 28 81.50 11 0 1 2 ## 34 0.43 30 77.31 21 0 1 1 ## ceradsc_RADCnonStd gpath niareagansc pmi amyloid amyloid_mf dlbdx ## 3 4 0.09 3 6.50 0.00 0.0 0 ## 8 4 0.07 3 7.75 0.40 0.0 2 ## 12 4 0.03 3 9.83 0.15 0.0 0 ## 23 3 0.04 3 8.73 0.16 1.3 0 ## 28 4 0.01 3 3.65 0.00 0.0 0 ## 34 4 0.01 3 5.83 0.00 0.0 0 ## mglia123_mf mglia23_mf mglia3_mf nft tangles tangles_mf tdp_cs_6reg tdp_st4 ## 3 170.35 3.91 0.00 0.28 3.02 0 0.17 1 ## 8 151.93 0.00 0.00 0.21 1.45 0 0.83 1 ## 12 NA NA NA 0.08 0.63 0 0.00 0 ## 23 203.15 4.67 0.51 0.03 0.38 0 0.00 0 ## 28 162.53 10.14 1.39 0.04 0.47 0 0.50 2 ## 34 NA NA NA 0.02 2.20 0 0.00 0 ## brainwt CERAD JohnsonDx Batch GIS BatchColor ## 3 1250 0 Control ROSMAP.R1.400 GIS turquoise ## 8 1103 0 Control ROSMAP.R1.400 GIS turquoise ## 12 1325 0 Control ROSMAP.R1.400 GIS turquoise ## 23 999 1 Control ROSMAP.R1.400 GIS turquoise ## 28 1146 0 Control ROSMAP.R1.400 GIS turquoise ## 34 1204 0 Control ROSMAP.R1.400 GIS turquoise ## tissue sex cts_mmse30_cat ## 3 dorsolateral prefrontal cortex female 1 ## 8 dorsolateral prefrontal cortex male 1 ## 12 dorsolateral prefrontal cortex female 1 ## 23 dorsolateral prefrontal cortex male 1 ## 28 dorsolateral prefrontal cortex female 1 ## 34 dorsolateral prefrontal cortex female 1
Open Subdomain-specific Zscore for CT, AD and AsymAD Individuals (Male and Female Separately):
################################################################################################################################################## ## CT # Open Subdomain-specific Zscore distribution files for CT male Subdomain_zscore_CT_male <- read.delim("/Users/poddea/Desktop/ROSMAP_data_100623/TMT_proteomics/Round2/Subdomain_specific_analysis/subdomain_zscore_CT_male_summary.txt", header = TRUE, sep = "\t"); #Subdomain_zscore_CT_male <- read.delim("/Users/poddea/Desktop/ROSMAP_data_100623/RNASeq_Harmonization_Study/Biodomain_mapping/Subdomain/sig_Subdomain_CT_male.txt", header = TRUE, sep = "\t"); # Ensure column names to` match `IDs` in `zscore_` files Subdomain_zscore_CT_male <- Subdomain_zscore_CT_male %>% rename_with(~ gsub("X", "", .), starts_with("X")) dim(Subdomain_zscore_CT_male)
## [1] 46 129
################################################################################################################################################## # Open Subdomain-specific Zscore distribution files for CT male Subdomain_zscore_CT_female <- read.delim("/Users/poddea/Desktop/ROSMAP_data_100623/TMT_proteomics/Round2/Subdomain_specific_analysis/Subdomain_zscore_CT_female_summary.txt", header = TRUE, sep = "\t"); #Subdomain_zscore_CT_female <- read.delim("/Users/poddea/Desktop/ROSMAP_data_100623/RNASeq_Harmonization_Study/Biodomain_mapping/Subdomain/subdomain_specific_analysis/sig_Subdomain_CT_female.txt", header = TRUE, sep = "\t"); # Ensure column names to` match `IDs` in `zscore_` files Subdomain_zscore_CT_female <- Subdomain_zscore_CT_female %>% rename_with(~ gsub("X", "", .), starts_with("X")) dim(Subdomain_zscore_CT_female)
## [1] 68 129
################################################################################################################################################## # Merge both datasets by stacking rows Subdomain_zscore_CT <- rbind(Subdomain_zscore_CT_male, Subdomain_zscore_CT_female) # Check the dimensions to ensure it merged correctly dim(Subdomain_zscore_CT)
## [1] 114 129
################################################################################################################################################## ################################################################################################################################################## ## AD # Open Subdomain-specific Zscore distribution files for AD male Subdomain_zscore_AD_male <- read.delim("/Users/poddea/Desktop/ROSMAP_data_100623/TMT_proteomics/Round2/Subdomain_specific_analysis/Subdomain_zscore_AD_male_summary.txt", header = TRUE, sep = "\t"); #Subdomain_zscore_AD_male <- read.delim("/Users/poddea/Desktop/ROSMAP_data_100623/RNASeq_Harmonization_Study/Biodomain_mapping/Subdomain/subdomain_specific_analysis/sig_Subdomain_CT_male.txt", header = TRUE, sep = "\t"); # Ensure column names to` match `IDs` in `zscore_` files Subdomain_zscore_AD_male <- Subdomain_zscore_AD_male %>% rename_with(~ gsub("X", "", .), starts_with("X")) dim(Subdomain_zscore_AD_male)
## [1] 47 129
################################################################################################################################################## # Open Subdomain-specific Zscore distribution files for AD male Subdomain_zscore_AD_female <- read.delim("/Users/poddea/Desktop/ROSMAP_data_100623/TMT_proteomics/Round2/Subdomain_specific_analysis/Subdomain_zscore_AD_female_summary.txt", header = TRUE, sep = "\t"); #Subdomain_zscore_AD_female <- read.delim("/Users/poddea/Desktop/ROSMAP_data_100623/RNASeq_Harmonization_Study/Biodomain_mapping/Subdomain/subdomain_specific_analysis/sig_Subdomain_CT_female.txt", header = TRUE, sep = "\t"); # Ensure column names to` match `IDs` in `zscore_` files Subdomain_zscore_AD_female <- Subdomain_zscore_AD_female %>% rename_with(~ gsub("X", "", .), starts_with("X")) dim(Subdomain_zscore_AD_female)
## [1] 127 129
################################################################################################################################################## # Merge both datasets by stacking rows Subdomain_zscore_AD <- rbind(Subdomain_zscore_AD_male, Subdomain_zscore_AD_female) # Check the dimensions to ensure it merged correctly dim(Subdomain_zscore_AD)
## [1] 174 129
################################################################################################################################################## ################################################################################################################################################## ## AsymAD # Open Subdomain-specific Zscore distribution files for AD male Subdomain_zscore_AsymAD_male <- read.delim("/Users/poddea/Desktop/ROSMAP_data_100623/TMT_proteomics/Round2/Subdomain_specific_analysis/Subdomain_zscore_AsymAD_male_summary.txt", header = TRUE, sep = "\t"); #Subdomain_zscore_AsymAD_male <- read.delim("/Users/poddea/Desktop/ROSMAP_data_100623/RNASeq_Harmonization_Study/Biodomain_mapping/Subdomain/subdomain_specific_analysis/sig_Subdomain_AsymAD_male.txt", header = TRUE, sep = "\t"); # Ensure column names to` match `IDs` in `zscore_` files Subdomain_zscore_AsymAD_male <- Subdomain_zscore_AsymAD_male %>% rename_with(~ gsub("X", "", .), starts_with("X")) dim(Subdomain_zscore_AsymAD_male)
## [1] 61 129
################################################################################################################################################## # Open Subdomain-specific Zscore distribution files for AsymAD male Subdomain_zscore_AsymAD_female <- read.delim("/Users/poddea/Desktop/ROSMAP_data_100623/TMT_proteomics/Round2/Subdomain_specific_analysis/Subdomain_zscore_AsymAD_female_summary.txt", header = TRUE, sep = "\t"); #Subdomain_zscore_AsymAD_female <- read.delim("/Users/poddea/Desktop/ROSMAP_data_100623/RNASeq_Harmonization_Study/Biodomain_mapping/Subdomain/subdomain_specific_analysis/sig_Subdomain_AsymAD_female.txt", header = TRUE, sep = "\t"); # Ensure column names to` match `IDs` in `zscore_` files Subdomain_zscore_AsymAD_female <- Subdomain_zscore_AsymAD_female %>% rename_with(~ gsub("X", "", .), starts_with("X")) dim(Subdomain_zscore_AsymAD_female)
## [1] 159 129
################################################################################################################################################## # Merge both datasets by stacking rows Subdomain_zscore_AsymAD <- rbind(Subdomain_zscore_AsymAD_male, Subdomain_zscore_AsymAD_female) # Check the dimensions to ensure it merged correctly dim(Subdomain_zscore_AsymAD)
## [1] 220 129
################################################################################################################################################## ################################################################################################################################################## ## Cases - AD + AsymAD # Merge both datasets by stacking rows Subdomain_zscore_cases <- rbind(Subdomain_zscore_AD, Subdomain_zscore_AsymAD) # Check the dimensions to ensure it merged correctly dim(Subdomain_zscore_cases)
## [1] 394 129
################################################################################################################################################## ################################################################################################################################################## # Merge CT, AD and AsymAD specific Z scores all together Subdomain_zscore_CTADAsymAD <- rbind(Subdomain_zscore_CT, Subdomain_zscore_cases) dim(Subdomain_zscore_CTADAsymAD)
## [1] 508 129
#head(Subdomain_zscore_CTADAsymAD)
Correlation of Clinical Features with Subdomain-specific Zscore (CT, AD and AsymAD all together):
library(ggplot2) library(dplyr) library(viridis) library(ggpubr) theme_set(theme_pubr()) # Merge CT, AD and AsymAD specific Z scores and clinical parameters altogether merged_data_CTADAsymAD <- Subdomain_zscore_CTADAsymAD %>% inner_join(sample_data_DLPFC_CTADAsymAD %>% select(batch.channel, ApoE4.Dose, braaksc, ceradsc_RADCnonStd, cts_mmse30_lv, cts_mmse30_cat, JohnsonDx, sex), by = c("Sample_ID" = "batch.channel")) %>% mutate(Group = paste(JohnsonDx, sex, sep = "_")) # Combine JohnsonDx and sex dim(merged_data_CTADAsymAD)
## [1] 508 137
write.table(merged_data_CTADAsymAD, "merged_data_CTADAsymAD.txt", sep = "\t", quote = FALSE, row.names = FALSE) #head(merged_data_CTADAsymAD) ################################################################################################################################################## # Define custom colors for each group group_colors <- c("AD_male" = "mediumblue", "AD_female" = "tomato", "AsymAD_male" = "dodgerblue", "AsymAD_female" = "coral", "Control_male" = "skyblue", "Control_female" = "lightpink") # Calculate 1st and 99th percentile from CT_male and CT_female #ct_values <- merged_data_CTADAsymAD %>% #filter(Group %in% c("CT_male", "CT_female")) %>% #pull(AM_ac_4_Zscore) #percentile_1 <- quantile(ct_values, probs = 0.01, na.rm = TRUE) # 1st percentile #percentile_99 <- quantile(ct_values, probs = 0.99, na.rm = TRUE) # 99th percentile # Create the box plot for Braak Stage ggplot(merged_data_CTADAsymAD, aes(x = factor(braaksc), y = AM_ac_4_Zscore, fill = as.numeric(braaksc))) + geom_boxplot(outlier.shape = NA, alpha = 0.7) + # Box plot with transparent fill geom_jitter(aes(color = Group), width = 0.2, size = 3, alpha = 0.8) + # Jittered points #geom_hline(yintercept = percentile_1, linetype = "dashed", color = "black", size = 1) + # 1st percentile line #geom_hline(yintercept = percentile_99, linetype = "dashed", color = "black", size = 1) + # 99th percentile line scale_color_manual(values = group_colors) + # Assign custom colors to Group scale_fill_gradient(low = "#D8BFD8", high = "#4B0082", guide = guide_colorbar(title = "Braak")) + # Purple gradient + labs(title = "APP Metabolism: amyloid-beta clearance Zscore Distribution by Braak Stage", x = "BRAAK", y = "Z-score", color = "Group", fill = "BRAAK") + theme_minimal(base_size = 16) + theme( text = element_text(size = 24), axis.text.x = element_text(size = 24), axis.text.y = element_text(size = 24), axis.title = element_text(size = 24), legend.position = "right" )

################################################################################ # Fit a Linear Model (Numeric BRAAK as Continuous Proxy) lm_model <- lm(AM_ac_4_Zscore ~ as.numeric(braaksc), data = merged_data_CTADAsymAD) summary(lm_model)
## ## Call: ## lm(formula = AM_ac_4_Zscore ~ as.numeric(braaksc), data = merged_data_CTADAsymAD) ## ## Residuals: ## Min 1Q Median 3Q Max ## -5.2396 -1.1840 -0.0376 0.9991 10.9946 ## ## Coefficients: ## Estimate Std. Error t value Pr(>|t|) ## (Intercept) -0.96475 0.26481 -3.643 0.000297 *** ## as.numeric(braaksc) 0.51475 0.06871 7.492 3.05e-13 *** ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ## ## Residual standard error: 1.826 on 506 degrees of freedom ## Multiple R-squared: 0.09985, Adjusted R-squared: 0.09807 ## F-statistic: 56.13 on 1 and 506 DF, p-value: 3.05e-13
# To get the correlation coefficient from R² #sqrt(0.09614) # ≈ 0.310 #Since the slope is positive, the correlation is: # correlation ≈ 0.310 ################################################################################ # Create the box plot for ApoE4.Dose Stage ggplot( merged_data_CTADAsymAD %>% filter(!is.na(ApoE4.Dose)), # ⬅️ Exclude NA ApoE4.Dose aes(x = factor(ApoE4.Dose), y = AM_ac_4_Zscore, fill = as.numeric(ApoE4.Dose))) + geom_boxplot(outlier.shape = NA, alpha = 0.7) + # Box plot with transparent fill geom_jitter(aes(color = Group), width = 0.2, size = 3, alpha = 0.8) + # Jittered points #geom_hline(yintercept = percentile_1, linetype = "dashed", color = "black", size = 1) + # 1st percentile line #geom_hline(yintercept = percentile_99, linetype = "dashed", color = "black", size = 1) + # 99th percentile line scale_color_manual(values = group_colors) + # Assign custom colors to Group scale_fill_gradient(low = "#D8BFD8", high = "#4B0082", guide = guide_colorbar(title = "ApoE4")) + # Purple gradient + labs(title = "APP Metabolism: amyloid-beta clearance Zscore Distribution by ApoE4 Dose", x = "APOE4", y = "Z-score", color = "Group", fill = "ApoE4") + theme_minimal(base_size = 16) + theme( text = element_text(size = 24), axis.text.x = element_text(size = 24), axis.text.y = element_text(size = 24), axis.title = element_text(size = 24), legend.position = "right" )

################################################################################ # Create the box plot for ceradsc_RADCnonStd ggplot(merged_data_CTADAsymAD, aes(x = factor(ceradsc_RADCnonStd), y = AM_ac_4_Zscore, fill = as.numeric(ceradsc_RADCnonStd))) + geom_boxplot(outlier.shape = NA, alpha = 0.7) + # Box plot with transparent fill geom_jitter(aes(color = Group), width = 0.2, size = 3, alpha = 0.8) + # Jittered points #geom_hline(yintercept = percentile_1, linetype = "dashed", color = "black", size = 1) + # 1st percentile line #geom_hline(yintercept = percentile_99, linetype = "dashed", color = "black", size = 1) + # 99th percentile line scale_color_manual(values = group_colors) + # Assign custom colors to Group scale_fill_gradient(high = "#D8BFD8", low = "#4B0082", guide = guide_colorbar(title = "CERAD")) + # Purple gradient + labs(title = "APP Metabolism: amyloid-beta clearance Zscore by ceradsc_RADCnonStd", x = "CERAD", y = "Z-score", color = "Group", fill = "CERAD") + theme_minimal(base_size = 16) + theme( text = element_text(size = 24), axis.text.x = element_text(size = 24), axis.text.y = element_text(size = 24), axis.title = element_text(size = 24), legend.position = "right" )

################################################################################ # Step 1: Rename only selected Metadata values, leave others unchanged merged_data_CTADAsymAD$cts_mmse30_cat <- recode(merged_data_CTADAsymAD$cts_mmse30_cat, "0" = "[0-23]", "1" = ">23" ) # Define fill colors using the recoded levels fill_colors <- c("[0-23]" = "#4B0082", ">23" = "#D8BFD8") ggplot(merged_data_CTADAsymAD, aes(x = factor(cts_mmse30_cat), y = AM_ac_4_Zscore, fill = cts_mmse30_cat)) + geom_boxplot(outlier.shape = NA, alpha = 0.7) + geom_jitter(aes(color = Group), width = 0.2, size = 3, alpha = 0.8) + scale_color_manual(values = group_colors) + scale_fill_manual(values = fill_colors, name = "MMSE Category") + labs( title = "APP Metabolism: amyloid-beta clearance Z-score Distribution by MMSE Category", x = "MMSE Category", y = "Z-score", color = "Group" ) + theme_minimal(base_size = 16) + theme( text = element_text(size = 24), axis.text.x = element_text(size = 24), axis.text.y = element_text(size = 24), axis.title = element_text(size = 24), legend.position = "right" )

################################################################################ # Prepare figures for manuscript #ggarrange(P2, P3, P4, P5, #labels = c("B", "C", "D", "E"), #font.label = list(size = 20)) # Adjust size as needed ################################################################################################################################################## ##################################################################################################################################################
Correlation of Clinical Features with Subdomain-specific Zscore (AD and AsymAD all together):
## Cases (AD + AsymAD) # Ensure Sample_ID and batch.channel are aligned (join the data based on Sample_ID/batch.channel) merged_data_cases <- merge(Subdomain_zscore_cases, sample_data_DLPFC_cases, by.x = "Sample_ID", by.y = "batch.channel") dim(merged_data_cases)
## [1] 394 170
# Replace NA values with 0 (or any other specific value) merged_data_cases[is.na(merged_data_cases)] <- 0 write.table(merged_data_cases, "merged_data_cases.txt", sep = "\t", quote = FALSE, row.names = FALSE) ################################################################################################################################################## # Function to compute correlation or Kruskal-Wallis test based on variable type correlation_and_kw_test <- function(Subdomain_col, metadata_col, is_categorical) { if (is_categorical) { # Perform Kruskal-Wallis test for categorical variables test <- kruskal.test(Subdomain_col ~ as.factor(metadata_col)) return(c(statistic = test$statistic, p_value = test$p.value, test_type = "Kruskal-Wallis")) } else { # Perform Pearson correlation for continuous variables test <- cor.test(Subdomain_col, metadata_col, use = "complete.obs") return(c(correlation = test$estimate, p_value = test$p.value, test_type = "Pearson")) } } # Identify categorical and continuous metadata columns categorical_cols <- c(134, 146, 147) # Indices of categorical metadata columns continuous_cols <- c(141) # Indices of continuous metadata columns # Compute correlations or Kruskal-Wallis results for each subdomain-metadata pair correlation_results_cases <- lapply(merged_data_cases[, 2:129], function(Subdomain_col) { # Combine results for categorical and continuous metadata columns results <- list() # Process categorical variables for (cat_col in categorical_cols) { metadata_col <- merged_data_cases[[cat_col]] results[[colnames(merged_data_cases)[cat_col]]] <- correlation_and_kw_test(Subdomain_col, metadata_col, is_categorical = TRUE) } # Process continuous variables for (cont_col in continuous_cols) { metadata_col <- merged_data_cases[[cont_col]] results[[colnames(merged_data_cases)[cont_col]]] <- correlation_and_kw_test(Subdomain_col, metadata_col, is_categorical = FALSE) } return(results) }) # Combine the results into a single data frame correlation_cases <- do.call(rbind, lapply(names(correlation_results_cases), function(Subdomain_name) { subdomain_results <- correlation_results_cases[[Subdomain_name]] data.frame( Subdomain = Subdomain_name, Metadata = names(subdomain_results), do.call(rbind, subdomain_results) ) })) # Apply BY correction correlation_cases$FDR_p_value <- p.adjust(correlation_cases$p_value, method = "fdr") # View results head(correlation_cases)
## Subdomain Metadata ## ApoE4.Dose Ap_amc_2_Zscore ApoE4.Dose ## braaksc Ap_amc_2_Zscore braaksc ## ceradsc_RADCnonStd Ap_amc_2_Zscore ceradsc_RADCnonStd ## cts_mmse30_lv Ap_amc_2_Zscore cts_mmse30_lv ## ApoE4.Dose1 Ap_apiid_3_Zscore ApoE4.Dose ## braaksc1 Ap_apiid_3_Zscore braaksc ## statistic.Kruskal.Wallis.chi.squared p_value ## ApoE4.Dose 3.29625219989816 0.192410128356779 ## braaksc 10.4633140536234 0.015011835317496 ## ceradsc_RADCnonStd 7.61385406281352 0.0222163442967026 ## cts_mmse30_lv 0.173537572611033 0.000540244139297383 ## ApoE4.Dose1 1.97127067753308 0.373202041873252 ## braaksc1 0.426273090117775 0.934763179805741 ## test_type FDR_p_value ## ApoE4.Dose Kruskal-Wallis 0.35823268 ## braaksc Kruskal-Wallis 0.09150071 ## ceradsc_RADCnonStd Kruskal-Wallis 0.11477945 ## cts_mmse30_lv Pearson 0.01627088 ## ApoE4.Dose1 Kruskal-Wallis 0.53977244 ## braaksc1 Kruskal-Wallis 0.95528692
# Write the result to a file write.table(correlation_cases, "correlation_cases_with_FDR_pvalues.txt", sep = "\t", quote = FALSE, row.names = FALSE) ################################################################################################################################################## ## Control + Cases (CT + AD + AsymAD) #head(merged_data_CTADAsymAD) dim(merged_data_CTADAsymAD)
## [1] 508 137
# Function to compute correlation or Kruskal-Wallis test based on variable type correlation_and_kw_test <- function(Subdomain_col, metadata_col, is_categorical) { if (is_categorical) { # Perform Kruskal-Wallis test for categorical variables test <- kruskal.test(Subdomain_col ~ as.factor(metadata_col)) return(c(statistic = test$statistic, p_value = test$p.value, test_type = "Kruskal-Wallis")) } else { # Perform Pearson correlation for continuous variables test <- cor.test(Subdomain_col, metadata_col, use = "complete.obs") return(c(correlation = test$estimate, p_value = test$p.value, test_type = "Pearson")) } } # Identify categorical and continuous metadata columns categorical_cols <- c(130, 131, 132) # Indices of categorical metadata columns continuous_cols <- c(133) # Indices of continuous metadata columns # Compute correlations or Kruskal-Wallis results for each subdomain-metadata pair correlation_results_all <- lapply(merged_data_CTADAsymAD[, 2:129], function(Subdomain_col) { # Combine results for categorical and continuous metadata columns results <- list() # Process categorical variables for (cat_col in categorical_cols) { metadata_col <- merged_data_CTADAsymAD[[cat_col]] results[[colnames(merged_data_CTADAsymAD)[cat_col]]] <- correlation_and_kw_test(Subdomain_col, metadata_col, is_categorical = TRUE) } # Process continuous variables for (cont_col in continuous_cols) { metadata_col <- merged_data_CTADAsymAD[[cont_col]] results[[colnames(merged_data_CTADAsymAD)[cont_col]]] <- correlation_and_kw_test(Subdomain_col, metadata_col, is_categorical = FALSE) } return(results) }) # Combine the results into a single data frame correlation_all <- do.call(rbind, lapply(names(correlation_results_all), function(Subdomain_name) { subdomain_results <- correlation_results_all[[Subdomain_name]] data.frame( Subdomain = Subdomain_name, Metadata = names(subdomain_results), do.call(rbind, subdomain_results) ) })) # Apply BY correction correlation_all$FDR_p_value <- p.adjust(correlation_all$p_value, method = "fdr") # View results head(correlation_all)
## Subdomain Metadata ## ApoE4.Dose Ap_amc_2_Zscore ApoE4.Dose ## braaksc Ap_amc_2_Zscore braaksc ## ceradsc_RADCnonStd Ap_amc_2_Zscore ceradsc_RADCnonStd ## cts_mmse30_lv Ap_amc_2_Zscore cts_mmse30_lv ## ApoE4.Dose1 Ap_apiid_3_Zscore ApoE4.Dose ## braaksc1 Ap_apiid_3_Zscore braaksc ## statistic.Kruskal.Wallis.chi.squared p_value ## ApoE4.Dose 3.64494660168111 0.161625507672932 ## braaksc 14.2469388230688 0.0269961244244145 ## ceradsc_RADCnonStd 13.6952377817797 0.00335075129966468 ## cts_mmse30_lv 0.188405240783301 1.91725642214584e-05 ## ApoE4.Dose1 1.92267619302265 0.382380880972626 ## braaksc1 2.6598001197392 0.850172309683564 ## test_type FDR_p_value ## ApoE4.Dose Kruskal-Wallis 0.3021183508 ## braaksc Kruskal-Wallis 0.0933919980 ## ceradsc_RADCnonStd Kruskal-Wallis 0.0256057413 ## cts_mmse30_lv Pearson 0.0005774325 ## ApoE4.Dose1 Kruskal-Wallis 0.5305664256 ## braaksc1 Kruskal-Wallis 0.9012178521
# Write the result to a file write.table(correlation_all, "correlation_all_with_FDR_pvalues.txt", sep = "\t", quote = FALSE, row.names = FALSE)
Correlation of Clinical Features with Subdomain-specific Zscore (CT, AD and AsymAD Separately):
#Plot the correlation between clinical parameters and individual Subdomain for each group
library(dplyr) library(ggplot2) library(stringr) library(ggnewscale) # Open the merged clinical parameter correlation table for the sigificant subdomain in each catagory (P value < 0.05) data <- read.csv("clinical_parameter_correlation_all_FDR_062325.csv", sep =",", header = TRUE, stringsAsFactors = FALSE) #data <- read.csv("clinical_parameter_correlation_spearman.csv", sep =",", header = TRUE, stringsAsFactors = FALSE) head(data)
## Sno Subdomain_initial unique_id Biodomain ## 1 1 AM_ac_4_Zscore AM_ac_4 APP Metabolism ## 2 2 IR_aoiir_2_Zscore IR_aoiir_2 Immune Response ## 3 3 Ep_others_2_Zscore Ep_others_2 Epigenetic ## 4 4 Ep_Dtfb_1_Zscore Ep_Dtfb_1 Epigenetic ## 5 5 MM_mmo_7_Zscore MM_mmo_7 Mitochondrial Metabolism ## 6 6 Ep_mpgs_4_Zscore Ep_mpgs_4 Epigenetic ## Subdomain ## 1 amyloid-beta clearance ## 2 activation of innate immune response ## 3 Epigenetic_others ## 4 DNA-binding transcription factor binding ## 5 mitochondrial membrane organization ## 6 miRNA-mediated post-transcriptional gene silencing ## Biodomain_Subdomain Metadata ## 1 APP Metabolism_amyloid-beta clearance ApoE4.Dose ## 2 Immune Response_activation of innate immune response ApoE4.Dose ## 3 Epigenetic_others ApoE4.Dose ## 4 Epigenetic_DNA-binding transcription factor binding ApoE4.Dose ## 5 Mitochondrial Metabolism_mitochondrial membrane organization ApoE4.Dose ## 6 Epigenetic_miRNA-mediated post-transcriptional gene silencing ApoE4.Dose ## correlation p_value test_type FDR_p_value nl10_FDR_p_value ## 1 0.2011072 0.000004720 Kruskal-Wallis 0.000197511 3.704408 ## 2 0.1763931 0.000192538 Kruskal-Wallis 0.003757289 2.425125 ## 3 0.1730732 0.000405195 Kruskal-Wallis 0.005762772 2.239369 ## 4 0.1581276 0.001211797 Kruskal-Wallis 0.011931540 1.923303 ## 5 -0.1366433 0.001435520 Kruskal-Wallis 0.013124754 1.881909 ## 6 0.1288321 0.001643482 Kruskal-Wallis 0.014507981 1.838393
# Define the color code for each Biodomain custom_colors <- c("Apoptosis" = "#673399", "APP Metabolism" = "#fe6500", "Autophagy" = "#9931fd", "Cell Cycle" = "#18857f", "DNA Repair" = "#f451ad", "Endolysosome" = "#3466cc", "Epigenetic" = "#cb3233", "Immune Response" = "#9ccdcc", "Lipid Metabolism" = "#989898", "Metal Binding and Homeostasis" = "#4b0d20", "Mitochondrial Metabolism" = "#97cb98", "Myelination" = "#996735", "Oxidative Stress" = "#ffcd66", "Proteostasis" = "#c8b269", "RNA Spliceosome" = "#0c9aff", "Structural Stabilization" = "#ff9a9a", "Synapse" = "#329a33", "Tau Homeostasis" = "#cb97cb", "Vasculature" = "#cecd02", "none" = "#7f7f7f") # Wrap long Biodomain_Subdomain names #data <- data %>% #mutate(Biodomain_Subdomain_wrapped = str_wrap(Biodomain_Subdomain, width = 40)) # Truncate long Biodomain_Subdomain names to 40 characters #data <- data %>% #mutate(Biodomain_Subdomain_short = str_trunc(Biodomain_Subdomain, width = 40)) # Step 1: Rename only selected Metadata values, leave others unchanged data$Metadata <- recode(data$Metadata, "ApoE4.Dose" = "APOE4", "braaksc" = "BRAAK", "ceradsc_RADCnonStd" = "CERAD", "cts_mmse30_lv"= "MMSE" ) # Step 2: Reorder Metadata as factor with desired order data$Metadata <- factor(data$Metadata, levels = c("APOE4", "CERAD", "BRAAK", "MMSE")) # Step 3: Identify Biodomain_Subdomain values that appear in more than one Metadata #common_subdomains <- data %>% #group_by(Biodomain_Subdomain) %>% #summarise(n_metadata = n_distinct(Metadata)) %>% #filter(n_metadata > 1) %>% #pull(Biodomain_Subdomain) # Step 4: Add a flag to the original data #data <- data %>% #mutate(is_common = Biodomain_Subdomain %in% common_subdomains) # Step 5:Order data by Biodomain and Biodomain_Subdomain data <- data %>% arrange(Biodomain, Biodomain_Subdomain) # Step 6:Create factor with correct order data$Biodomain_Subdomain <- factor(data$Biodomain_Subdomain, levels = unique(data$Biodomain_Subdomain)) # Step 7:Determine y positions for rectangles biodomain_bounds <- data %>% distinct(Biodomain, Biodomain_Subdomain) %>% group_by(Biodomain) %>% summarise( ymin = min(as.numeric(Biodomain_Subdomain)) - 0.5, ymax = max(as.numeric(Biodomain_Subdomain)) + 0.5 ) # Plot ggplot(data, aes(x = Metadata, y = Biodomain_Subdomain)) + # Draw background boxes per Biodomain group geom_rect(data = biodomain_bounds, aes(xmin = -Inf, xmax = Inf, ymin = ymin, ymax = ymax, fill = Biodomain), inherit.aes = FALSE, alpha = 0.20) + scale_fill_manual(values = custom_colors, name = "AD Biodomain") + # Reset fill scale for correlation bubble color new_scale_fill() + # Bubble plot layer geom_point(aes(fill = correlation, size = nl10_FDR_p_value), shape = 21, color = "black", alpha = 0.9) + scale_fill_gradient(low = "tomato", high = "seagreen", name = "Correlation") + scale_size(range = c(4, 12), name = "-log10(FDR P-value)") + # Box outline overlay for common points #geom_point( #data = subset(data, is_common), #aes(x = Metadata, y = Biodomain_Subdomain), #shape = 0, # square outline #stroke = 1.5, # Line thickness #color = "black", # Outline color #fill = NA, # Transparent fill to create a "box" effect #size = 8 # Match or slightly larger than base point size #) + # Labels and theme labs( title = "Clinical Parameters Correlation with Subdomain-specific Z score", x = "Clinical Parameters", y = "Biodomain (Subdomain)" ) + theme_minimal(base_size = 18) + theme( text = element_text(size = 14), axis.text.x = element_text(size = 18, hjust = 1, angle = 45), axis.text.y = element_text(size = 14), axis.title = element_text(size = 18), strip.text = element_text(size = 24) )

#write.table(data, "data_unique.txt", sep = "\t", quote = FALSE, row.names = FALSE)