Affinity-Enriched Plasma Proteomics for Biomarker Discovery in Abdominal Aortic Aneurysms
Author
Nicolai B. Palstrøm
Published
November 22, 2024
Introduction
R-code related to the manuscript titled “Affinity-Enriched Plasma Proteomics for Biomarker Discovery in Abdominal Aortic Aneurysms”. Abdominal Aortic Aneurysm (AAA) is a serious medical condition caused by the weakening and expansion of the abdominal aorta. Despite its severity, there are few diagnostic biomarkers available for detecting AAA. In an effort to identify new biomarkers, we conducted a mass-spectrometry-based proteomic analysis on affinity-enriched plasma samples from 45 AAA patients and 45 matched controls. The following R-code describe the process of generating the results presented in the manuscript.
Calculate the percentage of missing data for each variable
Reveal code
missing_percentage <-colMeans(is.na(dataset[6:ncol(dataset)])) *100# Categorize the variables based on missing data percentagecategories <-cut( missing_percentage,breaks =c(-Inf, 1, 25,51, 75, 100, Inf),labels =c("0%", "0-25%", "25-50%", "50-75%", "75-99%", "100% "),right =FALSE)# Count the number of variables in each categorycategory_count <-table(categories)# Convert to a data frame for plottingcategory_df <-as.data.frame(category_count)
Plot missingness
Reveal code
# Create the bar graph with labelslibrary(ggplot2)ggplot(category_df, aes(x = categories, y = Freq)) +geom_bar(stat ="identity", fill ="skyblue") +geom_text(aes(label = Freq), vjust =-0.5) +# Add labels above barslabs( x ="Missingness [%]",y ="Number of Proteins") +theme_minimal()
Encode the vector with the AAA condition as a factor:
# Create separate data frames for case and control samples## Cases = 1 / Controls = 0AAA_caseData <- filtered_data %>%filter(Condition =="Case")AAA_controlData <- filtered_data %>%filter(Condition =="Ctrl")# Get a list of all columns with continuous data:AAA_Columns <-names(filtered_data)[8:ncol(filtered_data)]# Create a new data frame to store the fold change resultsAAA_foldChangeData <-data.frame(Variables =character(0), Fold_Change =numeric(0))# Loop through each protein column and calculate fold changefor (i in AAA_Columns) {# Calculate the mean for case and control AAA_caseMean <- AAA_caseData %>%pull(i) %>%mean(na.rm=T) AAA_controlMean <- AAA_controlData %>%pull(i) %>%mean(na.rm=T)# Calculate fold change between groups AAA_foldChange <- AAA_caseMean / AAA_controlMean# Append the fold change to the results data frame AAA_foldChangeData <-rbind(AAA_foldChangeData, data.frame(Variables = i, Fold_Change = AAA_foldChange))}# Create a new data frame to store the t-test resultsAAA_tTestData <-data.frame(Variables =character(0), P_Value =numeric(0))# Loop through each protein column and calculate p-valuefor (i in AAA_Columns) {# Perform t-testAAA_tTest <-t.test(filtered_data[[i]] ~ filtered_data$Condition, var.equal =TRUE)# Append the p-value to the results data frameAAA_tTestData <-rbind(AAA_tTestData, data.frame(Variables = i, P_Value = AAA_tTest$p.value))}# Create an empty list to store information on missing valuesmissing_list <-list()# Loop through each protein and calculate the degree of missingness for each proteinfor (i in AAA_Columns) {# Calculate the degree of missing valuesmissing_percentage <-mean(is.na(filtered_data[[i]])) *100# Append the values to a new dataframemissing_list[[i]] <- missing_percentage}missing_df <-do.call(rbind, lapply(missing_list, as.data.frame))missing_df <-rownames_to_column(missing_df, var="Variables")colnames(missing_df) <-c("Variables", "Missing_Percentage")# Join the foldChangeData and tTestDataAAA_result_data <-full_join(AAA_foldChangeData, AAA_tTestData, by ="Variables")# Perform multiple testing correctionAAA_result_data$FDR <-p.adjust(AAA_result_data$P_Value, method ="fdr")# Join the result data with missing dataAAA_result_data <-left_join(AAA_result_data, missing_df, by ="Variables")sign_p_value <-sign(log2(AAA_result_data$Fold_Change))*-log10(AAA_result_data$P_Value)AAA_result_data$sign_p <- sign_p_value# Filter out rows that do not pass a 5% pAAA_result_data1 <- AAA_result_data %>%filter(P_Value <=5E-2)AAA_result_data_fdr <- AAA_result_data %>%filter(FDR <=5E-2)AAA_FDRSignificantVariables <- AAA_result_data_fdr$Variables# Print table sorted by p-values from lowest to highesthead(AAA_result_data1[order(AAA_result_data1$P_Value),],24)
#Remove variables with more than 50% missingnessfilter_df_50 <- filtered_data %>%select(where(~sum(is.na(.x))/length(.x)<0.5))#Create transposed dataset - see impute documentation for reasondata_temp <-t(filter_df_50[,8:ncol(filter_df_50)])#Perform imputation using kNNimputed_data <-impute.knn(as.matrix(data_temp),rng.seed=1)#Get left-out variables transferred to a new datasetfilter_imp_df <- filter_df_50#Transfer imputed data to the new datasetfilter_imp_df[8:ncol(filter_imp_df)] <-as.data.frame(t(imputed_data$data))#Rename datasetAAA_df <- filter_imp_df
# Load required librarieslibrary(ggplot2)library(dplyr)# Sample data frame (replace with your actual data)data <- readxl::read_excel(path =here("GO_Biological_Up.xlsx"))# Data processingdata <- data %>%arrange(Strength) %>%# Sort by 'Strength'mutate(log_FDR =-log10(FDR)) # Log10 transform the 'FDR' variable
Generate plot
Reveal code
# Create the lollipop chartplot_lollipop_up <-ggplot(data, aes(y =reorder(Pathway, Strength), x = Strength)) +# Add line (lollipop stick)geom_col(aes(fill = log_FDR), width =0.1, show.legend =FALSE)+# Add points (lollipop head), colored by log_FDR and sized by Observed_Genesgeom_point(aes(size = Observed_Genes, color = log_FDR)) +# Customize the color gradient (blue for low FDR, red for high FDR)scale_color_gradient(low ="blue", high ="red", name ="-log10(FDR)") +scale_fill_gradient(low ="blue", high ="red") +# Customize the size scale for the pointsscale_size_continuous(name ="Observed Genes", breaks =c(10,20,40,60) ,range=c(2,3.5)) +# Labels and theme adjustmentslabs(x ="Fold enrichment",y="") +theme_classic() +# Clean themetheme(axis.text.y =element_text(angle =0, hjust =1)) # Keep y-axis labels horizontalplot_lollipop_up
# Load required librarieslibrary(ggplot2)library(dplyr)# Sample data frame (replace with your actual data)data <- readxl::read_excel(path =here("GO_Biological_Down.xlsx"))# Data processingdata <- data %>%arrange(Strength) %>%# Sort by 'Strength'mutate(log_FDR =-log10(FDR)) # Log10 transform the 'FDR' variable
Generate plot
Reveal code
# Create the lollipop chartplot_lollipop_down <-ggplot(data, aes(y =reorder(Pathway, Strength), x = Strength)) +# Add line (lollipop stick)geom_col(aes(fill = log_FDR), width =0.1, show.legend =FALSE)+# Add points (lollipop head), colored by log_FDR and sized by Observed_Genesgeom_point(aes(size = Observed_Genes, color = log_FDR)) +# Customize the color gradient (blue for low FDR, red for high FDR)scale_color_gradient(low ="blue", high ="red", name ="-log10(FDR)") +scale_fill_gradient(low ="blue", high ="red") +# Customize the size scale for the pointsscale_size_continuous(name ="Observed Genes", breaks =c(5,10,20,30) ,range=c(2,3.5)) +# Labels and theme adjustmentslabs(x ="Fold enrichment",y="") +theme_classic() +# Clean themetheme(axis.text.y =element_text(angle =0, hjust =1)) # Keep y-axis labels horizontalplot_lollipop_down
#Define parameterscontrol <-rfeControl(functions=rfFuncs, method="boot")#Define predictors and the response variablepredictors <- AAA_df[regulated_proteins_names]response <- AAA_df$Condition# Run the RFE algorithmrfe_result <-rfe(predictors, response, sizes=c(5:20), rfeControl=control)# Select top 10 variables#paste(rfe_result$optVariables[1:10], collapse="+")
Train model based on selected proteins
Reveal code
# Train the final model with all variablesset.seed(1)# Set up cross-validationtrain_control <-trainControl(method ="repeatedcv", repeats =5,number =10,classProbs =TRUE, summaryFunction = twoClassSummary,savePredictions =TRUE)#Train model with fixed hyperparametersfinal_model <-train(Condition ~ P02787+Q96KN2+P80108+P08254+Q9NQZ6+P60903+P19021+Q15517+Q9UM07+P10916,data = AAA_df[,c("Condition",regulated_proteins_names)], method ="rf", trControl = train_control,tuneGrid =data.frame(.mtry =2) )# Select a parameter settingselectedIndices <- final_model$pred$mtry ==2pROC::auc(roc(final_model$pred$obs[selectedIndices], final_model$pred$Case[selectedIndices]))
# Extract predictions for the final modelfinal_preds <- final_model$pred# Compute the average ROC for the final modelfinal_roc <-roc(final_preds$obs, final_preds$Case)# Extract the confusion matrixfinal_model[["finalModel"]][["confusion"]]
Ctrl Case class.error
Ctrl 40 5 0.1111111
Case 6 39 0.1333333
Plot ROC curve for individual proteins
Reveal code
# List of variablesvariables <-c("P02787", "Q96KN2", "P80108", "P08254", "Q9NQZ6", "P60903", "P19021", "Q15517", "Q9UM07", "P10916")# Initialize an empty list to store ROC curvesroc_list <-list()# Loop over each variable to train individual models and calculate ROC curvesfor (var in variables) { formula <-as.formula(paste("Condition ~", var))# Train model for each individual variableset.seed(123) model <-train(formula, data = AAA_df[,c("Condition",regulated_proteins_names)], method ="rf",metric ="ROC", trControl = train_control,tuneGrid =data.frame(.mtry =1))# Extract predictions preds <- model$pred# Compute ROC curve roc_curve <-roc(preds$obs, preds$Case)# Store ROC curve roc_list[[var]] <- roc_curve}# Plot individual ROC curvesplot(roc_list[[1]], col =rgb(0.2, 0.2, 1, alpha =0.3), lwd =2, main ="ROC Curves for Individual Variables and Final Model")for (i in2:length(roc_list)) {plot(roc_list[[i]], col =rgb(0.2, 0.2, 1, alpha =0.3), lwd =2, add =TRUE)}# Plot the final model ROC curve in redplot(final_roc, col ="red", lwd =2, add =TRUE)# Add a legendlegend("bottomright", legend =c("Final Model", "Individual Proteins"), col =c("red", rgb(0.2, 0.2, 1, alpha =0.3)), lwd =2)
Correlation of proteins with aorta diameter
After suggestions from one of the reviewers, proteins were examined for their correlation with aorta diameter.
Reveal code
# Calculate correlations between aorta diameter and proteins# Function to compute correlation and p-valuecor_with_p <-function(x, y) { cor_test <-cor.test(x, y, use ="complete.obs")list(correlation = cor_test$estimate, p_value = cor_test$p.value)}# Apply the function across columns 8 to 761 i.e. the protein dataresults <-lapply(8:761, function(col) {cor_with_p(AAA_df[[col]], AAA_df[[7]])})# Extract correlations and p-values into separate vectorscorrelations <-sapply(results, function(res) res$correlation)p_values <-sapply(results, function(res) res$p_value)# Add protein namesnames(correlations) <-names(AAA_df)[8:761]names(p_values) <-names(AAA_df)[8:761]# Combine into a data frame for easy viewingcor_results <-data.frame(Variable =names(AAA_df)[8:761],Correlation = correlations,P_Value = p_values)# Sort by correlation (absolute value) or p-valuesorted_results <- cor_results[order(abs(cor_results$Correlation), decreasing =TRUE), ]significant_results <-subset(cor_results, P_Value <0.05)
Create bar plot showing proteins with >|0.35| correlations
Reveal code
filtered_cor <-subset(significant_results, Correlation >0.35| Correlation <-0.35)cor_bar_plot <-ggplot(filtered_cor, aes(x =reorder(Variable, Correlation), y = Correlation, fill = Correlation >0)) +geom_bar(stat ="identity", color ="black") +scale_fill_manual(values =c("FALSE"="blue", "TRUE"="red"), labels =c("Negative", "Positive")) +coord_flip() +labs(x ="UniProt Acc. Nr.", y ="Correlation") +theme_minimal()
# Join patients present in proteomic data onlyAAA_clin_prot <-left_join(AAA_df,Patient_info,by="SampleID")# Calculate the difference between the outer and inner aortic areaAAA_clin_prot$ILT_area <- AAA_clin_prot$OUTERARE-AAA_clin_prot$INNERARE# If the difference is larger than 0, then there is an ILTAAA_clin_prot$ILT_status <-ifelse(is.na(AAA_clin_prot$ILT_area), 0, ifelse(AAA_clin_prot$ILT_area >0, 1, 0))
Subgroup analyses
Cases - ILT vs non-ILT
Reveal code
# Subset the dataframe where Condition == "Case"df_case <- AAA_clin_prot[AAA_clin_prot$Condition =="Case", ]# Initialize an empty dataframe to store resultsresults_condition_cases <-data.frame(Variables =character(),Fold_Change =numeric(),P_Value =numeric(),stringsAsFactors =FALSE)# Loop through variables 8:761for (i in8:761) { variable_name <-colnames(AAA_clin_prot)[i]# Perform t-test t_test <-t.test(df_case[[i]] ~as.factor(df_case$ILT_status), na.action = na.omit)# Calculate fold-change group_means <-tapply(df_case[[i]], as.factor(df_case$ILT_status), mean, na.rm =TRUE)if (length(group_means) ==2) { fold_change <- group_means[2] / group_means[1] # Assuming levels are ordered properly } else { fold_change <-NA# Handle cases where one group has no data }# Append results to the dataframe results_condition_cases <-rbind( results_condition_cases,data.frame(Variables = variable_name,Fold_Change = fold_change,P_Value = t_test$p.value,stringsAsFactors =FALSE ) )}#writexl::write_xlsx(results_condition_cases,path=here("export","T_tests_only_cases.xlsx"))
Controls vs ILT
Reveal code
# Subset the dataframe where ILT_status != 0index_non_zero_ILT <- AAA_clin_prot$ILT_status ==1| AAA_clin_prot$Condition =="Ctrl"# Use the index to subset the dataframedf_non_zero_ILT <- AAA_clin_prot[index_non_zero_ILT, ]summary(df_non_zero_ILT$Condition)
Ctrl Case
45 35
Reveal code
# Initialize an empty dataframe to store resultsresults_condition <-data.frame(Variables =character(),Fold_Change =numeric(),P_Value =numeric(),stringsAsFactors =FALSE)# Loop through variables 8:761for (i in8:761) { variable_name <-colnames(AAA_clin_prot)[i]# Perform t-test t_test <-t.test(df_non_zero_ILT[[i]] ~ df_non_zero_ILT$Condition, na.action = na.omit)# Calculate fold-change group_means <-tapply(df_non_zero_ILT[[i]], df_non_zero_ILT$Condition, mean, na.rm =TRUE)if (length(group_means) ==2) { fold_change <- group_means[2] / group_means[1] # Assuming levels are ordered properly } else { fold_change <-NA# Handle cases where one group has no data }# Append results to the dataframe results_condition <-rbind( results_condition,data.frame(Variables = variable_name,Fold_Change = fold_change,P_Value = t_test$p.value,stringsAsFactors =FALSE ) )}#writexl::write_xlsx(results_condition,path=here("export","T_tests_w_ILT.xlsx"))
Controls vs non-ILT
Reveal code
# Subset the dataframe where ILT_status = 0index_zero_ILT <- AAA_clin_prot$ILT_status ==0| AAA_clin_prot$Condition =="Ctrl"# Use the index to subset the dataframedf_zero_ILT <- AAA_clin_prot[index_zero_ILT, ]summary(df_zero_ILT$Condition)
Ctrl Case
45 10
Reveal code
# Initialize an empty dataframe to store resultsresults_condition_w_zero <-data.frame(Variables =character(),Fold_Change =numeric(),P_Value =numeric(),stringsAsFactors =FALSE)# Loop through variables 8:761for (i in8:761) { variable_name <-colnames(AAA_clin_prot)[i]# Perform t-test t_test <-t.test(df_zero_ILT[[i]] ~ df_zero_ILT$Condition, na.action = na.omit)# Calculate fold-change group_means <-tapply(df_zero_ILT[[i]], df_zero_ILT$Condition, mean, na.rm =TRUE)if (length(group_means) ==2) { fold_change <- group_means[2] / group_means[1] # Assuming levels are ordered properly } else { fold_change <-NA# Handle cases where one group has no data }# Append results to the dataframe results_condition_w_zero <-rbind( results_condition_w_zero,data.frame(Variables = variable_name,Fold_Change = fold_change,P_Value = t_test$p.value,stringsAsFactors =FALSE ) )}#writexl::write_xlsx(results_condition_w_zero,path=here("export","T_tests_wo_ILT.xlsx"))
Combine into a single table with original data
Reveal code
# Define the proposed protein biomarkers:proteins <-c("P02787", "Q96KN2", "P80108", "P08254", "Q9NQZ6", "P60903", "P19021", "Q15517", "Q9UM07", "P10916")# Define the dataframe names to becombineddataframes <-list(Original = AAA_result_data,ILT_neg_vs_ILT_pos = results_condition_cases,Ctrl_vs_non_ILT = results_condition_w_zero,Ctrl_vs_ILT = results_condition)# Initialize an empty list to store the resultsresults_list <-list()# Loop through the dataframes and extract the required informationfor (df_name innames(dataframes)) { df <- dataframes[[df_name]]# Filter the dataframe to keep only the rows corresponding to the proteins of interest filtered_df <- df[df$Variable %in% proteins, ]# Add a column to identify the source dataframe filtered_df$Source <- df_name# Select and reorder columns (Variable, Fold_Change, P_Value, Source) filtered_df <- filtered_df[, c("Variables", "Fold_Change", "P_Value", "Source")]# Append the filtered dataframe to the results list results_list[[df_name]] <- filtered_df}# Combine all filtered dataframes into onefinal_results <-do.call(rbind, results_list)rownames(final_results) <-NULL# Print the final combined dataframeprint(final_results)
# Train the final model with all variablesset.seed(1)# Set up cross-validationtrain_control <-trainControl(method ="repeatedcv", repeats =5,number =10,classProbs =TRUE, summaryFunction = twoClassSummary,savePredictions =TRUE)#Train model with fixed hyperparametersfinal_model_w_ITL <-train(Condition ~ P02787+Q96KN2+P80108+P08254+Q9NQZ6+P60903+P19021+Q15517+Q9UM07+P10916+ILT_status,data = AAA_clin_prot[c("Condition",regulated_proteins_names,"ILT_status")], method ="rf", trControl = train_control,tuneGrid =data.frame(.mtry =2) )# Select a parameter settingselectedIndices_w_ITL <- final_model_w_ITL$pred$mtry ==2pROC::auc(roc(final_model_w_ITL$pred$obs[selectedIndices_w_ITL], final_model_w_ITL$pred$Case[selectedIndices_w_ITL]))
# Extract predictions for the final modelfinal_preds_w_ITL <- final_model_w_ITL$pred# Compute the average ROC for the final modelfinal_roc_w_ITL <-roc(final_preds_w_ITL$obs, final_preds_w_ITL$Case)# Extract the confusion matrixfinal_model_w_ITL[["finalModel"]][["confusion"]]
Ctrl Case class.error
Ctrl 43 2 0.04444444
Case 4 41 0.08888889
Original model without ILT patients
Reveal code
# Train the final model with all variablesset.seed(1)# Set up cross-validationtrain_control <-trainControl(method ="repeatedcv", repeats =5,number =10,classProbs =TRUE, summaryFunction = twoClassSummary,savePredictions =TRUE)#Train model with fixed hyperparametersfinal_model_wo_ITL <-train(Condition ~ P02787+Q96KN2+P80108+P08254+Q9NQZ6+P60903+P19021+Q15517+Q9UM07+P10916,data = AAA_clin_prot[AAA_clin_prot$ILT_status==0,c("Condition",regulated_proteins_names)], method ="rf", trControl = train_control,tuneGrid =data.frame(.mtry =2) )# Select a parameter settingselectedIndices_wo_ITL <- final_model_wo_ITL$pred$mtry ==2pROC::auc(roc(final_model_wo_ITL$pred$obs[selectedIndices_wo_ITL], final_model_wo_ITL$pred$Case[selectedIndices_wo_ITL]))
# Extract predictions for the final modelfinal_preds_wo_ITL <- final_model_wo_ITL$pred# Compute the average ROC for the final modelfinal_roc_wo_ITL <-roc(final_preds_wo_ITL$obs, final_preds_wo_ITL$Case)# Extract the confusion matrixfinal_model_wo_ITL[["finalModel"]][["confusion"]]
Ctrl Case class.error
Ctrl 43 2 0.04444444
Case 6 4 0.60000000