Affinity-Enriched Plasma Proteomics for Biomarker Discovery in Abdominal Aortic Aneurysms

Author

Nicolai B. Palstrøm

Published

November 22, 2024

Introduction

R-code related to the manuscript titled “Affinity-Enriched Plasma Proteomics for Biomarker Discovery in Abdominal Aortic Aneurysms”. Abdominal Aortic Aneurysm (AAA) is a serious medical condition caused by the weakening and expansion of the abdominal aorta. Despite its severity, there are few diagnostic biomarkers available for detecting AAA. In an effort to identify new biomarkers, we conducted a mass-spectrometry-based proteomic analysis on affinity-enriched plasma samples from 45 AAA patients and 45 matched controls. The following R-code describe the process of generating the results presented in the manuscript.

Preparation

Load packages

Reveal code
if(!require(tidyverse)){
    install.packages("tidyverse")
    library(tidyverse)
}

if(!require(ggthemes)){
    install.packages("ggthemes")
    library(ggthemes)
}

if(!require(ggplot2)){
    install.packages("ggplot2")
    library(ggplot2)
}

if(!require(ggpubr)){
    install.packages("ggpubr")
    library(ggpubr)
}
if(!require(here)){
    install.packages("here")
    library(here)
}
if(!require(impute)){
  if (!require("BiocManager", quietly = TRUE))
    install.packages("BiocManager")
    BiocManager::install("impute")
    library(impute)
}
if(!require(caret)){
    install.packages("caret")
    library(caret)
}
if(!require(mlbench)){
    install.packages("mlbench")
    library(mlbench)
}
if(!require(pROC)){
    install.packages("pROC")
    library(pROC)
}

Import data

Reveal code
dataset <- readxl::read_excel("data/Datasets.xlsx",sheet=1)

Data examination

Calculate the percentage of missing data for each variable

Reveal code
missing_percentage <- colMeans(is.na(dataset[6:ncol(dataset)])) * 100

# Categorize the variables based on missing data percentage
categories <- cut(
  missing_percentage,
  breaks = c(-Inf, 1, 25,51, 75, 100, Inf),
  labels = c("0%", "0-25%", "25-50%", "50-75%", "75-99%", "100% "),
  right = FALSE
)

# Count the number of variables in each category
category_count <- table(categories)

# Convert to a data frame for plotting
category_df <- as.data.frame(category_count)

Plot missingness

Reveal code
# Create the bar graph with labels
library(ggplot2)
ggplot(category_df, aes(x = categories, y = Freq)) +
  geom_bar(stat = "identity", fill = "skyblue") +
  geom_text(aes(label = Freq), vjust = -0.5) +  # Add labels above bars
  labs(       x = "Missingness [%]",
       y = "Number of Proteins") +
  theme_minimal()

Encode the vector with the AAA condition as a factor:

Reveal code
dataset$Condition <- 
  factor(dataset$Condition,
         levels = c(0,1),
         labels=c("Ctrl", # Reference
                  "Case")
         ,ordered = TRUE)

str(dataset$Condition)
 Ord.factor w/ 2 levels "Ctrl"<"Case": 2 2 2 2 2 2 2 1 1 1 ...

Remove non-quantified proteins

Reveal code
filtered_data <- dataset %>% select_if(~ !all(is.na(.)))

Calculate protein regulations and p-values:

Reveal code
# Create separate data frames for case and control samples
## Cases = 1 / Controls = 0
AAA_caseData <- filtered_data %>% filter(Condition == "Case")
AAA_controlData <- filtered_data %>% filter(Condition == "Ctrl")

# Get a list of all columns with continuous data:
AAA_Columns <- names(filtered_data)[8:ncol(filtered_data)]

# Create a new data frame to store the fold change results
AAA_foldChangeData <- data.frame(Variables = character(0), Fold_Change = numeric(0))

# Loop through each protein column and calculate fold change
for (i in AAA_Columns) {
  
  # Calculate the mean for case and control
  AAA_caseMean <- AAA_caseData %>% pull(i) %>% mean(na.rm=T)
  AAA_controlMean <- AAA_controlData %>% pull(i) %>% mean(na.rm=T)
  
  # Calculate fold change between groups
  AAA_foldChange <- AAA_caseMean / AAA_controlMean
  
  # Append the fold change to the results data frame
  AAA_foldChangeData <- rbind(AAA_foldChangeData, data.frame(Variables = i, Fold_Change = AAA_foldChange))
}

# Create a new data frame to store the t-test results

AAA_tTestData <- data.frame(Variables = character(0), P_Value = numeric(0))

# Loop through each protein column and calculate p-value
for (i in AAA_Columns) {
  
# Perform t-test
AAA_tTest <- t.test(filtered_data[[i]] ~ filtered_data$Condition, var.equal = TRUE)

# Append the p-value to the results data frame
AAA_tTestData <- rbind(AAA_tTestData, data.frame(Variables = i, P_Value = AAA_tTest$p.value))
}

# Create an empty list to store information on missing values
missing_list <- list()

# Loop through each protein and calculate the degree of missingness for each protein
for (i in AAA_Columns) {
  
# Calculate the degree of missing values
missing_percentage <- mean(is.na(filtered_data[[i]])) * 100

# Append the values to a new dataframe
missing_list[[i]] <- missing_percentage
}

missing_df <- do.call(rbind, lapply(missing_list, as.data.frame))

missing_df <- rownames_to_column(missing_df, var="Variables")
colnames(missing_df) <- c("Variables", "Missing_Percentage")

# Join the foldChangeData and tTestData
AAA_result_data <- full_join(AAA_foldChangeData, AAA_tTestData, by = "Variables")

# Perform multiple testing correction
AAA_result_data$FDR <- p.adjust(AAA_result_data$P_Value, method = "fdr")

# Join the result data with missing data
AAA_result_data <- left_join(AAA_result_data, missing_df, by = "Variables")

sign_p_value <- sign(log2(AAA_result_data$Fold_Change))*-log10(AAA_result_data$P_Value)

AAA_result_data$sign_p <- sign_p_value

# Filter out rows that do not pass a 5% p
AAA_result_data1 <- AAA_result_data %>% 
  filter(P_Value <= 5E-2)

AAA_result_data_fdr <- AAA_result_data %>% 
  filter(FDR <= 5E-2)
AAA_FDRSignificantVariables <- AAA_result_data_fdr$Variables

# Print table sorted by p-values from lowest to highest
head(AAA_result_data1[order(AAA_result_data1$P_Value),],24)
       Variables Fold_Change      P_Value          FDR Missing_Percentage
1   AAA_diameter   2.5554734 9.147812e-28 1.473712e-24            0.00000
4         Q9NQZ6   2.2456146 4.102711e-06 3.304733e-03            0.00000
31        Q8IU81   2.7729788 1.631972e-05 8.763689e-03            0.00000
135       Q96KN2   0.8715908 5.870251e-05 2.334648e-02            0.00000
67        P02787   1.1622664 8.372503e-05 2.334648e-02            0.00000
194       P03950   0.8429115 1.006391e-04 2.334648e-02            0.00000
92        P01023   1.0986888 1.036751e-04 2.334648e-02            0.00000
129       P07357   1.2538405 1.159353e-04 2.334648e-02            0.00000
162       P60903   0.7925234 1.906707e-04 3.413006e-02           16.66667
291       Q92918   2.2825804 2.793530e-04 4.050796e-02           66.66667
28        P05164   1.7320140 3.028050e-04 4.050796e-02            0.00000
158       P60985   0.7379606 3.190358e-04 4.050796e-02            0.00000
262       P20594   1.4028144 3.268799e-04 4.050796e-02           50.00000
103       P08709   1.3768075 3.613198e-04 4.157759e-02            0.00000
52        P06310   1.1959227 4.524920e-04 4.767907e-02            0.00000
87        P02746   1.2123611 5.780156e-04 4.767907e-02            0.00000
21        P11388   1.3976464 5.854906e-04 4.767907e-02            0.00000
38        P03956   1.3059687 6.014768e-04 4.767907e-02            0.00000
226       P38646   2.3172310 6.063096e-04 4.767907e-02           83.33333
156       P04439   1.2828659 6.476634e-04 4.767907e-02            0.00000
182       P22352   0.8181053 6.508373e-04 4.767907e-02            0.00000
84        P14174   1.3521004 6.879986e-04 4.767907e-02            0.00000
98        P08603   1.0845756 7.120167e-04 4.767907e-02            0.00000
181       P80108   0.9043580 7.252563e-04 4.767907e-02            0.00000
       sign_p
1   27.038683
4    5.386929
31   4.787287
135 -4.231343
67   4.077145
194 -3.997233
92   3.984325
129  3.935784
162 -3.719716
291  3.553847
28   3.518837
158 -3.496161
262  3.485612
103  3.442108
52   3.344389
87   3.238060
21   3.232480
38   3.220781
226  3.217306
156  3.188651
182 -3.186528
84   3.162412
98   3.147510
181 -3.139508
Reveal code
#Create lists of regulated proteins
up_reg <- AAA_result_data1 %>% filter(Fold_Change > 1 & P_Value < 0.05)

up_reg_names <- up_reg$Variables

down_reg <- AAA_result_data1 %>% filter(Fold_Change < 1 & P_Value < 0.05)

down_reg_names <- down_reg$Variables

regulated_proteins <- AAA_result_data1 %>% filter(P_Value < 0.05 & Missing_Percentage < 49)

regulated_proteins_names <- regulated_proteins$Variables

Export list of signed p-values:

Reveal code
writexl::write_xlsx(AAA_result_data,path=here("export","T_tests.xlsx"))

Create volcano plot:

Reveal code
# Create a volcano plot
volcano_plot <- ggplot(AAA_result_data, aes(x = log2(Fold_Change), y = -log10(P_Value))) +
  geom_point(alpha = 0.4, size = 1.5) +
  theme_minimal() +
  labs(
    x = "Log2 Fold Change",
    y = "-Log10 P-value"
  ) +
  geom_hline(yintercept = -log10(0.05), linetype = "dashed", color = "black")+
  geom_hline(yintercept = -log10(7.4E-4), linetype = "dashed", color = "red")+xlim(-3,3)+ylim(0,6)+geom_vline(xintercept = log2(1.2))+geom_vline(xintercept = log2(0.83))

volcano_plot

Save plot

Reveal code
#ggsave("Volcano_plot.pdf",plot=volcano_plot,device = "pdf",path = here("plots"),width = 7,height = 7)

Perform imputation of missing values:

Reveal code
#Remove variables with more than 50% missingness
filter_df_50 <- filtered_data %>%
  select(where(~sum(is.na(.x))/length(.x)<0.5))
#Create transposed dataset - see impute documentation for reason
data_temp <- t(filter_df_50[,8:ncol(filter_df_50)])
#Perform imputation using kNN
imputed_data <- impute.knn(as.matrix(data_temp),rng.seed=1)
#Get left-out variables transferred to a new dataset
filter_imp_df <- filter_df_50
#Transfer imputed data to the new dataset
filter_imp_df[8:ncol(filter_imp_df)] <- as.data.frame(t(imputed_data$data))

#Rename dataset
AAA_df <- filter_imp_df

Check for negative values

Reveal code
any(filter_imp_df[8:ncol(filter_imp_df)] < 0,na.rm = T)
[1] FALSE

Generate plots from the GO analysis

Up-regulated:

Load data from the GO analysis:

Reveal code
# Load required libraries
library(ggplot2)
library(dplyr)

# Sample data frame (replace with your actual data)
data <- readxl::read_excel(path = here("GO_Biological_Up.xlsx"))

# Data processing
data <- data %>%
  arrange(Strength) %>%                         # Sort by 'Strength'
  mutate(log_FDR = -log10(FDR))                 # Log10 transform the 'FDR' variable

Generate plot

Reveal code
# Create the lollipop chart
plot_lollipop_up <- ggplot(data, aes(y = reorder(Pathway, Strength), x = Strength)) +
  
  # Add line (lollipop stick)
  geom_col(aes(fill = log_FDR), width = 0.1, show.legend = FALSE)+
  
  # Add points (lollipop head), colored by log_FDR and sized by Observed_Genes
  geom_point(aes(size = Observed_Genes, color = log_FDR)) +
  
  # Customize the color gradient (blue for low FDR, red for high FDR)
  scale_color_gradient(low = "blue", high = "red", 
                       name = "-log10(FDR)") +
  scale_fill_gradient(low = "blue", high = "red") +
  # Customize the size scale for the points
  scale_size_continuous(name = "Observed Genes", 
                        breaks = c(10,20,40,60) ,
                        range=c(2,3.5)) +
  
  # Labels and theme adjustments
  labs(x = "Fold enrichment",y="") +
  
  theme_classic() +                          # Clean theme
  theme(axis.text.y = element_text(angle = 0, hjust = 1))  # Keep y-axis labels horizontal
plot_lollipop_up

Save plot

Reveal code
ggsave("Lollipop_Up_regulated.pdf",plot=plot_lollipop_up,device = "pdf",path = here("plots"),width = 7,height = 5.3)

Down-regulated:

Load data from the Go analysis:

Reveal code
# Load required libraries
library(ggplot2)
library(dplyr)

# Sample data frame (replace with your actual data)
data <- readxl::read_excel(path = here("GO_Biological_Down.xlsx"))

# Data processing
data <- data %>%
  arrange(Strength) %>%                         # Sort by 'Strength'
  mutate(log_FDR = -log10(FDR))                 # Log10 transform the 'FDR' variable

Generate plot

Reveal code
# Create the lollipop chart
plot_lollipop_down <- ggplot(data, aes(y = reorder(Pathway, Strength), x = Strength)) +
  
  # Add line (lollipop stick)
  geom_col(aes(fill = log_FDR), width = 0.1, show.legend = FALSE)+
  
  # Add points (lollipop head), colored by log_FDR and sized by Observed_Genes
  geom_point(aes(size = Observed_Genes, color = log_FDR)) +
  
  # Customize the color gradient (blue for low FDR, red for high FDR)
  scale_color_gradient(low = "blue", high = "red", 
                       name = "-log10(FDR)") +
  scale_fill_gradient(low = "blue", high = "red") +
  # Customize the size scale for the points
  scale_size_continuous(name = "Observed Genes", 
                        breaks = c(5,10,20,30) ,
                        range=c(2,3.5)) +
  
  # Labels and theme adjustments
  labs(x = "Fold enrichment",y="") +
  
  theme_classic() +                          # Clean theme
  theme(axis.text.y = element_text(angle = 0, hjust = 1))  # Keep y-axis labels horizontal
plot_lollipop_down

Save plot

Reveal code
ggsave("Lollipop_Down_regulated.pdf",plot=plot_lollipop_down,device = "pdf",path = here("plots"),width = 7,height = 5.3)

Classification model training and evaluation:

Perform recursive feature elimination (RFE):

Reveal code
#Define parameters
control <- rfeControl(functions=rfFuncs, method="boot")

#Define predictors and the response variable
predictors <- AAA_df[regulated_proteins_names]
response <- AAA_df$Condition

# Run the RFE algorithm
rfe_result <- rfe(predictors, response, sizes=c(5:20), rfeControl=control)

# Select top 10 variables
#paste(rfe_result$optVariables[1:10], collapse="+")

Train model based on selected proteins

Reveal code
# Train the final model with all variables
set.seed(1)

# Set up cross-validation
train_control <- trainControl(
  method = "repeatedcv", 
  repeats = 5,
  number = 10,
  classProbs = TRUE, 
  summaryFunction = twoClassSummary,
  savePredictions = TRUE
)

#Train model with fixed hyperparameters
final_model <- train(Condition ~ P02787+Q96KN2+P80108+P08254+Q9NQZ6+P60903+P19021+Q15517+Q9UM07+P10916,
                     data = AAA_df[,c("Condition",regulated_proteins_names)], 
                     method = "rf", 
                     trControl = train_control,
                     tuneGrid = data.frame(.mtry = 2)
                     )

# Select a parameter setting
selectedIndices <- final_model$pred$mtry == 2
pROC::auc(roc(final_model$pred$obs[selectedIndices],
         final_model$pred$Case[selectedIndices]))
Area under the curve: 0.9302
Reveal code
pROC::ci.auc(roc(final_model$pred$obs[selectedIndices],
         final_model$pred$Case[selectedIndices]))
95% CI: 0.9073-0.953 (DeLong)
Reveal code
# Extract predictions for the final model
final_preds <- final_model$pred

# Compute the average ROC for the final model
final_roc <- roc(final_preds$obs, final_preds$Case)

# Extract the confusion matrix
final_model[["finalModel"]][["confusion"]]
     Ctrl Case class.error
Ctrl   40    5   0.1111111
Case    6   39   0.1333333

Plot ROC curve for individual proteins

Reveal code
# List of variables
variables <- c("P02787", "Q96KN2", "P80108", "P08254", "Q9NQZ6", 
               "P60903", "P19021", "Q15517", "Q9UM07", "P10916")

# Initialize an empty list to store ROC curves
roc_list <- list()

# Loop over each variable to train individual models and calculate ROC curves
for (var in variables) {
  formula <- as.formula(paste("Condition ~", var))
  
  # Train model for each individual variable
  set.seed(123)
  model <- train(formula, data = AAA_df[,c("Condition",regulated_proteins_names)], method = "rf",
                 metric = "ROC", trControl = train_control,tuneGrid = data.frame(.mtry = 1))
  
  # Extract predictions
  preds <- model$pred
  
  # Compute ROC curve
  roc_curve <- roc(preds$obs, preds$Case)
  
  # Store ROC curve
  roc_list[[var]] <- roc_curve
}

# Plot individual ROC curves
plot(roc_list[[1]], col = rgb(0.2, 0.2, 1, alpha = 0.3), lwd = 2, main = "ROC Curves for Individual Variables and Final Model")

for (i in 2:length(roc_list)) {
  plot(roc_list[[i]], col = rgb(0.2, 0.2, 1, alpha = 0.3), lwd = 2, add = TRUE)
}

# Plot the final model ROC curve in red
plot(final_roc, col = "red", lwd = 2, add = TRUE)

# Add a legend
legend("bottomright", legend = c("Final Model", "Individual Proteins"), 
       col = c("red", rgb(0.2, 0.2, 1, alpha = 0.3)), lwd = 2)

Correlation of proteins with aorta diameter

After suggestions from one of the reviewers, proteins were examined for their correlation with aorta diameter.

Reveal code
# Calculate correlations between aorta diameter and proteins

# Function to compute correlation and p-value
cor_with_p <- function(x, y) {
  cor_test <- cor.test(x, y, use = "complete.obs")
  list(correlation = cor_test$estimate, p_value = cor_test$p.value)
}

# Apply the function across columns 8 to 761 i.e. the protein data
results <- lapply(8:761, function(col) {
  cor_with_p(AAA_df[[col]], AAA_df[[7]])
})

# Extract correlations and p-values into separate vectors
correlations <- sapply(results, function(res) res$correlation)
p_values <- sapply(results, function(res) res$p_value)

# Add protein names
names(correlations) <- names(AAA_df)[8:761]
names(p_values) <- names(AAA_df)[8:761]

# Combine into a data frame for easy viewing
cor_results <- data.frame(
  Variable = names(AAA_df)[8:761],
  Correlation = correlations,
  P_Value = p_values
)

# Sort by correlation (absolute value) or p-value
sorted_results <- cor_results[order(abs(cor_results$Correlation), decreasing = TRUE), ]
significant_results <- subset(cor_results, P_Value < 0.05)

Create bar plot showing proteins with >|0.35| correlations

Reveal code
filtered_cor <- subset(significant_results, Correlation > 0.35 | Correlation < -0.35)

cor_bar_plot <- ggplot(filtered_cor, aes(x = reorder(Variable, Correlation), y = Correlation, fill = Correlation > 0)) +
  geom_bar(stat = "identity", color = "black") +
  scale_fill_manual(values = c("FALSE" = "blue", "TRUE" = "red"), labels = c("Negative", "Positive")) +
  coord_flip() +
  labs(x = "UniProt Acc. Nr.", y = "Correlation") +
  theme_minimal()

Save plot

Reveal code
ggsave("Filtered_Correlation_plot.pdf",plot=cor_bar_plot,device = "pdf",path = here("plots"),width = 7,height = 4)

Export dataset with significant correlations

Reveal code
writexl::write_xlsx(significant_results,path=here("Significant_Correlations.xlsx"))

Sensitivity analysis - Intraluminal thrombus

Read dataset containing patient info

Reveal code
Patient_info <- readxl::read_excel(here("sample_info","Patient_info.xlsx"),sheet=2)

Join patient info with proteomic data

Reveal code
# Join patients present in proteomic data only
AAA_clin_prot <- left_join(AAA_df,Patient_info,by="SampleID")

# Calculate the difference between the outer and inner aortic area
AAA_clin_prot$ILT_area <- AAA_clin_prot$OUTERARE-AAA_clin_prot$INNERARE

# If the difference is larger than 0, then there is an ILT
AAA_clin_prot$ILT_status <- ifelse(is.na(AAA_clin_prot$ILT_area), 0, ifelse(AAA_clin_prot$ILT_area > 0, 1, 0))

Subgroup analyses

Cases - ILT vs non-ILT

Reveal code
# Subset the dataframe where Condition == "Case"
df_case <- AAA_clin_prot[AAA_clin_prot$Condition == "Case", ]

# Initialize an empty dataframe to store results
results_condition_cases <- data.frame(
  Variables = character(),
  Fold_Change = numeric(),
  P_Value = numeric(),
  stringsAsFactors = FALSE
)

# Loop through variables 8:761
for (i in 8:761) {
  variable_name <- colnames(AAA_clin_prot)[i]
  
  # Perform t-test
  t_test <- t.test(df_case[[i]] ~ as.factor(df_case$ILT_status), na.action = na.omit)
  
  # Calculate fold-change
  group_means <- tapply(df_case[[i]], as.factor(df_case$ILT_status), mean, na.rm = TRUE)
  if (length(group_means) == 2) {
    fold_change <- group_means[2] / group_means[1]  # Assuming levels are ordered properly
  } else {
    fold_change <- NA  # Handle cases where one group has no data
  }
  
  # Append results to the dataframe
  results_condition_cases <- rbind(
    results_condition_cases,
    data.frame(
      Variables = variable_name,
       Fold_Change = fold_change,
      P_Value = t_test$p.value,
      stringsAsFactors = FALSE
    )
  )
}

#writexl::write_xlsx(results_condition_cases,path=here("export","T_tests_only_cases.xlsx"))

Controls vs ILT

Reveal code
# Subset the dataframe where ILT_status != 0

index_non_zero_ILT <- AAA_clin_prot$ILT_status == 1 | AAA_clin_prot$Condition =="Ctrl"

# Use the index to subset the dataframe
df_non_zero_ILT <- AAA_clin_prot[index_non_zero_ILT, ]

summary(df_non_zero_ILT$Condition)
Ctrl Case 
  45   35 
Reveal code
# Initialize an empty dataframe to store results
results_condition <- data.frame(
  Variables = character(),
  Fold_Change = numeric(),
  P_Value = numeric(),
  stringsAsFactors = FALSE
)

# Loop through variables 8:761
for (i in 8:761) {
  variable_name <- colnames(AAA_clin_prot)[i]
  
  # Perform t-test
  t_test <- t.test(df_non_zero_ILT[[i]] ~ df_non_zero_ILT$Condition, na.action = na.omit)
  
  # Calculate fold-change
  group_means <- tapply(df_non_zero_ILT[[i]], df_non_zero_ILT$Condition, mean, na.rm = TRUE)
  if (length(group_means) == 2) {
    fold_change <- group_means[2] / group_means[1]  # Assuming levels are ordered properly
  } else {
    fold_change <- NA  # Handle cases where one group has no data
  }
  
  # Append results to the dataframe
  results_condition <- rbind(
    results_condition,
    data.frame(
      Variables = variable_name,
       Fold_Change = fold_change,
      P_Value = t_test$p.value,
      stringsAsFactors = FALSE
    )
  )
}

#writexl::write_xlsx(results_condition,path=here("export","T_tests_w_ILT.xlsx"))

Controls vs non-ILT

Reveal code
# Subset the dataframe where ILT_status = 0

index_zero_ILT <- AAA_clin_prot$ILT_status == 0 | AAA_clin_prot$Condition == "Ctrl"

# Use the index to subset the dataframe
df_zero_ILT <- AAA_clin_prot[index_zero_ILT, ]

summary(df_zero_ILT$Condition)
Ctrl Case 
  45   10 
Reveal code
# Initialize an empty dataframe to store results
results_condition_w_zero <- data.frame(
  Variables = character(),
  Fold_Change = numeric(),
  P_Value = numeric(),
  stringsAsFactors = FALSE
)

# Loop through variables 8:761
for (i in 8:761) {
  variable_name <- colnames(AAA_clin_prot)[i]
  
  # Perform t-test
  t_test <- t.test(df_zero_ILT[[i]] ~ df_zero_ILT$Condition, na.action = na.omit)
  
  # Calculate fold-change
  group_means <- tapply(df_zero_ILT[[i]], df_zero_ILT$Condition, mean, na.rm = TRUE)
  if (length(group_means) == 2) {
    fold_change <- group_means[2] / group_means[1]  # Assuming levels are ordered properly
  } else {
    fold_change <- NA  # Handle cases where one group has no data
  }
  
  # Append results to the dataframe
  results_condition_w_zero <- rbind(
    results_condition_w_zero,
    data.frame(
      Variables = variable_name,
       Fold_Change = fold_change,
      P_Value = t_test$p.value,
      stringsAsFactors = FALSE
    )
  )
}

#writexl::write_xlsx(results_condition_w_zero,path=here("export","T_tests_wo_ILT.xlsx"))

Combine into a single table with original data

Reveal code
# Define the proposed protein biomarkers:
proteins <- c("P02787", "Q96KN2", "P80108", "P08254", 
              "Q9NQZ6", "P60903", "P19021", "Q15517", 
              "Q9UM07", "P10916")

# Define the dataframe names to becombined
dataframes <- list(
  Original = AAA_result_data,
  ILT_neg_vs_ILT_pos = results_condition_cases,
  Ctrl_vs_non_ILT = results_condition_w_zero,
  Ctrl_vs_ILT = results_condition
)

# Initialize an empty list to store the results
results_list <- list()

# Loop through the dataframes and extract the required information
for (df_name in names(dataframes)) {
  df <- dataframes[[df_name]]
  
  # Filter the dataframe to keep only the rows corresponding to the proteins of interest
  filtered_df <- df[df$Variable %in% proteins, ]
  
  # Add a column to identify the source dataframe
  filtered_df$Source <- df_name
  
  # Select and reorder columns (Variable, Fold_Change, P_Value, Source)
  filtered_df <- filtered_df[, c("Variables", "Fold_Change", "P_Value", "Source")]
  
  # Append the filtered dataframe to the results list
  results_list[[df_name]] <- filtered_df
}

# Combine all filtered dataframes into one
final_results <- do.call(rbind, results_list)

rownames(final_results) <- NULL

# Print the final combined dataframe
print(final_results)
   Variables Fold_Change      P_Value             Source
1     Q9NQZ6   2.2456146 4.102711e-06           Original
2     Q9UM07   1.2152493 3.737650e-02           Original
3     P02787   1.1622664 8.372503e-05           Original
4     P19021   0.7134832 9.544842e-03           Original
5     Q96KN2   0.8715908 5.870251e-05           Original
6     Q15517   0.8197757 1.190030e-03           Original
7     P60903   0.7925234 1.906707e-04           Original
8     P80108   0.9043580 7.252563e-04           Original
9     P08254   0.8754197 2.230038e-02           Original
10    P10916   0.8302533 3.718676e-02           Original
11    Q9NQZ6   1.2525570 3.098302e-01 ILT_neg_vs_ILT_pos
12    Q9UM07   0.9922193 9.398884e-01 ILT_neg_vs_ILT_pos
13    P02787   1.0889466 1.069141e-02 ILT_neg_vs_ILT_pos
14    P19021   1.1299037 4.117999e-01 ILT_neg_vs_ILT_pos
15    Q96KN2   0.9557226 3.020364e-01 ILT_neg_vs_ILT_pos
16    Q15517   0.9964212 9.755288e-01 ILT_neg_vs_ILT_pos
17    P60903   1.0334335 6.421862e-01 ILT_neg_vs_ILT_pos
18    P80108   0.9639015 4.096563e-01 ILT_neg_vs_ILT_pos
19    P08254   1.0415046 6.177286e-01 ILT_neg_vs_ILT_pos
20    P10916   0.9245803 2.935460e-01 ILT_neg_vs_ILT_pos
21    Q9NQZ6   1.8769242 4.944919e-02    Ctrl_vs_non_ILT
22    Q9UM07   1.2169299 6.460737e-02    Ctrl_vs_non_ILT
23    P02787   1.0870626 3.604672e-02    Ctrl_vs_non_ILT
24    P19021   0.6920975 1.096333e-02    Ctrl_vs_non_ILT
25    Q96KN2   0.9026771 1.541746e-02    Ctrl_vs_non_ILT
26    Q15517   0.8220640 9.576055e-02    Ctrl_vs_non_ILT
27    P60903   0.7981544 6.729331e-04    Ctrl_vs_non_ILT
28    P80108   0.9304828 9.225938e-02    Ctrl_vs_non_ILT
29    P08254   0.8480437 2.894294e-02    Ctrl_vs_non_ILT
30    P10916   0.8819906 2.033395e-01    Ctrl_vs_non_ILT
31    Q9NQZ6   2.3509547 1.947456e-05        Ctrl_vs_ILT
32    Q9UM07   1.2074614 3.577217e-02        Ctrl_vs_ILT
33    P02787   1.1837531 3.817378e-05        Ctrl_vs_ILT
34    P19021   0.7820035 3.148294e-02        Ctrl_vs_ILT
35    Q96KN2   0.8627090 9.868970e-05        Ctrl_vs_ILT
36    Q15517   0.8191220 1.211698e-03        Ctrl_vs_ILT
37    P60903   0.8248395 1.123985e-03        Ctrl_vs_ILT
38    P80108   0.8968937 8.540914e-04        Ctrl_vs_ILT
39    P08254   0.8832414 5.534414e-02        Ctrl_vs_ILT
40    P10916   0.8154712 2.810096e-02        Ctrl_vs_ILT

Random Forest re-analysis

Original model + ILT status as a variable

Reveal code
# Train the final model with all variables
set.seed(1)

# Set up cross-validation
train_control <- trainControl(
  method = "repeatedcv", 
  repeats = 5,
  number = 10,
  classProbs = TRUE, 
  summaryFunction = twoClassSummary,
  savePredictions = TRUE
)

#Train model with fixed hyperparameters
final_model_w_ITL <- train(Condition ~ P02787+Q96KN2+P80108+P08254+Q9NQZ6+P60903+P19021+Q15517+Q9UM07+P10916+ILT_status,
                     data = AAA_clin_prot[c("Condition",regulated_proteins_names,"ILT_status")], 
                     method = "rf", 
                     trControl = train_control,
                     tuneGrid = data.frame(.mtry = 2)
                     )

# Select a parameter setting
selectedIndices_w_ITL <- final_model_w_ITL$pred$mtry == 2
pROC::auc(roc(final_model_w_ITL$pred$obs[selectedIndices_w_ITL],
         final_model_w_ITL$pred$Case[selectedIndices_w_ITL]))
Area under the curve: 0.9657
Reveal code
pROC::ci.auc(roc(final_model_w_ITL$pred$obs[selectedIndices_w_ITL],
         final_model_w_ITL$pred$Case[selectedIndices_w_ITL]))
95% CI: 0.9492-0.9823 (DeLong)
Reveal code
# Extract predictions for the final model
final_preds_w_ITL <- final_model_w_ITL$pred

# Compute the average ROC for the final model
final_roc_w_ITL <- roc(final_preds_w_ITL$obs, final_preds_w_ITL$Case)

# Extract the confusion matrix
final_model_w_ITL[["finalModel"]][["confusion"]]
     Ctrl Case class.error
Ctrl   43    2  0.04444444
Case    4   41  0.08888889

Original model without ILT patients

Reveal code
# Train the final model with all variables
set.seed(1)

# Set up cross-validation
train_control <- trainControl(
  method = "repeatedcv", 
  repeats = 5,
  number = 10,
  classProbs = TRUE, 
  summaryFunction = twoClassSummary,
  savePredictions = TRUE
)

#Train model with fixed hyperparameters
final_model_wo_ITL <- train(Condition ~ P02787+Q96KN2+P80108+P08254+Q9NQZ6+P60903+P19021+Q15517+Q9UM07+P10916,
                     data = AAA_clin_prot[AAA_clin_prot$ILT_status==0,c("Condition",regulated_proteins_names)], 
                     method = "rf", 
                     trControl = train_control,
                     tuneGrid = data.frame(.mtry = 2)
                     )

# Select a parameter setting
selectedIndices_wo_ITL <- final_model_wo_ITL$pred$mtry == 2
pROC::auc(roc(final_model_wo_ITL$pred$obs[selectedIndices_wo_ITL],
         final_model_wo_ITL$pred$Case[selectedIndices_wo_ITL]))
Area under the curve: 0.8039
Reveal code
pROC::ci.auc(roc(final_model_wo_ITL$pred$obs[selectedIndices_wo_ITL],
         final_model_wo_ITL$pred$Case[selectedIndices_wo_ITL]))
95% CI: 0.7243-0.8834 (DeLong)
Reveal code
# Extract predictions for the final model
final_preds_wo_ITL <- final_model_wo_ITL$pred

# Compute the average ROC for the final model
final_roc_wo_ITL <- roc(final_preds_wo_ITL$obs, final_preds_wo_ITL$Case)

# Extract the confusion matrix
final_model_wo_ITL[["finalModel"]][["confusion"]]
     Ctrl Case class.error
Ctrl   43    2  0.04444444
Case    6    4  0.60000000