Chapter 9 Supp figure 3

Violin plots testing the effect of metadata categories on the 16S beta diversity (Weighted unifrac) paired distances between samples. (A) Displays distances within effect groups (Bufferused, Participant number, Storagedays, and Storagetemp). Paired distances between samples which only differed by the specified effect on the x-axis are included. (B) Displays paired distances between the specified buffer comparisons. Only paired distances between samples with identical Participant number, Storagedays, and Storagetemp are included.

9.1 Libraries

library("phyloseq")
library("microbiome")
library("ggplot2")
#library("plotly")
library("tidyverse")
library("ggforce")
library("ggh4x")
library("patchwork")
library("rbiom")

9.2 16S composition

Carrying out inter distance divergence

We want one boxplot showing difference caused by different factors

  • Temp
  • Buffer
  • Participant
  • Storage days

For Temp we get all identical paired samples that only differ by temp to get those paired distances. We then produce a boxplot from this. We can then compare this bxoplot to the buffer one, participant one, and storage days one in one plot with boxes.

9.2.1 Table of all paired distances

#Load rarefied data created for figure 2A
load("./data/preproces_physeq_even_depth30k")
#Calculate weighted Unifrac values
wunifrac_df <- as.matrix(rbiom::unifrac(biom = microbiome::abundances(physeq),
                                        weighted=TRUE, tree = phyloseq::phy_tree(physeq)))
#Convert doubled distance values and same comparisons to NAs
#Loop through columns starting at the 2nd
r <- 1
for (r in 1:nrow(wunifrac_df)) {
  wunifrac_df[r,r:ncol(wunifrac_df)] <- NA
}
#Get a metdata dataframe with:
# Sample name, Temp, Buffer, Paricipant, and storage days
metadf <- as.data.frame(sample_data(physeq))
columns_of_interest <- c("Sample_name", "Patientnumber","Bufferused", "Storagetemp","DaysstoredpriortoDNAextraction")
metadf <- metadf[,columns_of_interest]
#Change some of the metadata values
metadf$Storagetemp <- gsub(x = metadf$Storagetemp, pattern = "-80" , replacement = "Baseline")
metadf$Storagetemp <- gsub(x = metadf$Storagetemp, pattern = "4" , replacement = "Fridge")
metadf$Storagetemp <- gsub(x = metadf$Storagetemp, pattern = "20" , replacement = "RT")
metadf$DaysstoredpriortoDNAextraction <- gsub(x = metadf$DaysstoredpriortoDNAextraction, pattern = "1" , replacement = "1D")
metadf$DaysstoredpriortoDNAextraction <- gsub(x = metadf$DaysstoredpriortoDNAextraction, pattern = "3" , replacement = "3D")
metadf[metadf$Storagetemp == "Baseline","DaysstoredpriortoDNAextraction"] <- "Baseline"

#Combine metadata and distance matrix
#Ensure the row order is the same
#identical(row.names(metadf), row.names(wunifrac_df)) 
#rbind
wunifrac_metadata_df <- cbind(metadf, wunifrac_df)
#Convert to long
wunifrac_long_df <- tidyr::pivot_longer(data = wunifrac_metadata_df,
                                        #longify the distance values by using the
                                        #col names of the distance df
                                        cols = colnames(wunifrac_df),
                                        names_to = "Sample_name_comp", values_to = "dist")
#Change sample names of comp_samples names to remove _.*
wunifrac_long_df$Sample_name_comp <- gsub(x = wunifrac_long_df$Sample_name_comp, 
                                           pattern = "_.*", replacement = "")
#Remove rows where dist is NA
#Identical sample comparisons and duplicated sample comparisons
wunifrac_long_df <- na.omit(wunifrac_long_df)
#Remove rows where the samples are identical
wunifrac_long_df <- wunifrac_long_df[wunifrac_long_df$Sample_name != wunifrac_long_df$Sample_name_comp,]
#Add metadata for comparison sample
wunifrac_long_df$Patientnumber_comp <-
  as.vector(unlist(metadf[wunifrac_long_df$Sample_name_comp,"Patientnumber"]))
wunifrac_long_df$Bufferused_comp <-
  as.vector(unlist(metadf[wunifrac_long_df$Sample_name_comp,"Bufferused"]))
wunifrac_long_df$Storagetemp_comp <-
  as.vector(unlist(metadf[wunifrac_long_df$Sample_name_comp,"Storagetemp"]))
wunifrac_long_df$DaysstoredpriortoDNAextraction_comp <-
  as.vector(unlist(metadf[wunifrac_long_df$Sample_name_comp,"DaysstoredpriortoDNAextraction"]))
#Move dist to end row
wunifrac_long_df$wunifrac <- wunifrac_long_df$dist
wunifrac_long_df <- subset(wunifrac_long_df, select = -dist)
#Remove unneeded objects
rm(physeq, wunifrac_metadata_df)

9.2.2 Extract the paired distances

We will end up with a data frame with the following columns

  • Distance (wunifrac)
  • Effect (temp, buffer, etc.)
  • Comparison (e.g. Baseline_RT)
  • Sample pair (A1_B1)
#Create an empty dataframe first
dist_df_long <- as.data.frame(matrix(data = NA, ncol = 4, nrow = 0))
colnames(dist_df_long) <- c("wunifrac", "effect", "comparison","sample_pair")

#First for participant
participant_df <- wunifrac_long_df
#create new columns of combined buffer, temp, and days for ease
participant_df$buffer_temp_days <- paste0(participant_df$Bufferused, "_",
                                         participant_df$Storagetemp, "_",
                                         participant_df$DaysstoredpriortoDNAextraction)
participant_df$buffer_temp_days_comp <- paste0(participant_df$Bufferused_comp, "_",
                                         participant_df$Storagetemp_comp, "_",
                                         participant_df$DaysstoredpriortoDNAextraction_comp)
#Keep only rows where these 2 created rows are identical
participant_df <- participant_df[
  participant_df$buffer_temp_days == participant_df$buffer_temp_days_comp,]
#Add effects column
participant_df$effect <- "Participant number"
#Add comparison column
participant_df$comparison <- paste0(participant_df$Patientnumber, "_",
                                   participant_df$Patientnumber_comp)
#Add pairs column
participant_df$sample_pair <- paste0(participant_df$Sample_name, "_", participant_df$Sample_name_comp)
#Create df  to add to main df
tmp_dist_df <- participant_df[,c("wunifrac", "effect", "comparison", "sample_pair")]
#Add to dist data frame
dist_df_long <- rbind(dist_df_long, tmp_dist_df)

#Second is Buffer
buffer_df <- wunifrac_long_df
#create new columns of combined participant, temp, and days for ease
buffer_df$participant_temp_days <- paste0(buffer_df$Patientnumber, "_",
                                         buffer_df$Storagetemp, "_",
                                         buffer_df$DaysstoredpriortoDNAextraction)
buffer_df$participant_temp_days_comp <- paste0(buffer_df$Patientnumber_comp, "_",
                                         buffer_df$Storagetemp_comp, "_",
                                         buffer_df$DaysstoredpriortoDNAextraction_comp)
#Keep only rows where these 2 created rows are identical
buffer_df <- buffer_df[
  buffer_df$participant_temp_days == buffer_df$participant_temp_days_comp,]
#Add effects column
buffer_df$effect <- "Bufferused"
#Add comparison column
buffer_df$comparison <- paste0(buffer_df$Bufferused, "_",
                                   buffer_df$Bufferused_comp)
#Add pairs column
buffer_df$sample_pair <- paste0(buffer_df$Sample_name, "_", buffer_df$Sample_name_comp)
#Create df  to add to main df
tmp_dist_df <- buffer_df[,c("wunifrac", "effect", "comparison", "sample_pair")]
#Add to dist data frame
dist_df_long <- rbind(dist_df_long, tmp_dist_df)

#Third is Storagetemp
temp_df <- wunifrac_long_df
#create new columns of combined participant, buffer, and days for ease
temp_df$participant_buffer_days <- paste0(temp_df$Patientnumber, "_",
                                         temp_df$Bufferused, "_",
                                         temp_df$DaysstoredpriortoDNAextraction)
temp_df$participant_buffer_days_comp <- paste0(temp_df$Patientnumber_comp, "_",
                                         temp_df$Bufferused_comp, "_",
                                         temp_df$DaysstoredpriortoDNAextraction_comp)
#Keep only rows where these 2 created rows are identical
temp_df <- temp_df[
  temp_df$participant_buffer_days == temp_df$participant_buffer_days_comp,]
#Add effects column
temp_df$effect <- "Storagetemp"
#Add comparison column
temp_df$comparison <- paste0(temp_df$Storagetemp, "_",
                                   temp_df$Storagetemp_comp)
#Add pairs column
temp_df$sample_pair <- paste0(temp_df$Sample_name, "_", temp_df$Sample_name_comp)
#Create df  to add to main df
tmp_dist_df <- temp_df[,c("wunifrac", "effect", "comparison", "sample_pair")]
#Add to dist data frame
dist_df_long <- rbind(dist_df_long, tmp_dist_df)

#Fourth is Storagedays
days_df <- wunifrac_long_df
#create new columns of combined participant, buffer, and temp for ease
days_df$participant_buffer_temp <- paste0(days_df$Patientnumber, "_",
                                         days_df$Bufferused, "_",
                                         days_df$Storagetemp)
days_df$participant_buffer_days_temp <- paste0(days_df$Patientnumber_comp, "_",
                                         days_df$Bufferused_comp, "_",
                                         days_df$Storagetemp_comp)
#Keep only rows where these 2 created rows are identical
days_df <- days_df[
  days_df$participant_buffer_temp == days_df$participant_buffer_days_temp,]
#Add effects column
days_df$effect <- "Storagedays"
#Add comparison column
days_df$comparison <- paste0(days_df$DaysstoredpriortoDNAextraction, "_",
                                   days_df$DaysstoredpriortoDNAextraction_comp)
#Add pairs column
days_df$sample_pair <- paste0(days_df$Sample_name, "_", days_df$Sample_name_comp)
#Create df  to add to main df
tmp_dist_df <- days_df[,c("wunifrac", "effect", "comparison", "sample_pair")]
#Add to dist data frame
dist_df_long <- rbind(dist_df_long, tmp_dist_df)

#Remove unwanted objects
rm(wunifrac_long_df, participant_df, buffer_df, temp_df, days_df, tmp_dist_df)
#Create violin plot
alpha_violin <- ggplot(dist_df_long, aes(y = wunifrac, x = effect)) +
                ggplot2::geom_violin() +
                ggforce::geom_sina(alpha=0.5) +
                labs(x = "Effect", y = "Weighted unifrac distance")

9.2.3 Buffer deep dive

Here we want to look at the differences between the buffers. We want boxplots with the same values as the above one but separated by the comparison e.g.

  • No buffer vs Ethanol
  • No buffer vs PSP
  • etc.
#Subset our data to only contian the buffer effect info
buffer_dist_df_long <- dist_df_long[dist_df_long$effect == "Bufferused", ]
#Change names and order of levels
buffer_dist_df_long$comparison <- gsub(pattern = "Ethanol_No buffer",
                                       replacement = "No buffer & Ethanol",
                                       x = buffer_dist_df_long$comparison)
buffer_dist_df_long$comparison <- gsub(pattern = "Ethanol_PSP",
                                       replacement = "Ethanol & PSP",
                                       x = buffer_dist_df_long$comparison)
buffer_dist_df_long$comparison <- gsub(pattern = "Ethanol_RNAlater",
                                       replacement = "Ethanol & RNAlater",
                                       x = buffer_dist_df_long$comparison)
buffer_dist_df_long$comparison <- gsub(pattern = "PSP_No buffer",
                                       replacement = "No buffer & PSP",
                                       x = buffer_dist_df_long$comparison)
buffer_dist_df_long$comparison <- gsub(pattern = "RNAlater_No buffer",
                                       replacement = "No buffer & RNAlater",
                                       x = buffer_dist_df_long$comparison)
buffer_dist_df_long$comparison <- gsub(pattern = "RNAlater_PSP",
                                       replacement = "PSP & RNAlater",
                                       x = buffer_dist_df_long$comparison)
#Change to factor and reorder
buffer_dist_df_long$comparison <- factor(buffer_dist_df_long$comparison, 
       levels = c("No buffer & Ethanol", "No buffer & PSP", "No buffer & RNAlater",
                  "Ethanol & PSP", "Ethanol & RNAlater", "PSP & RNAlater"))
#Create violin plot
buffer_alpha_violin <- ggplot(buffer_dist_df_long, aes(y = wunifrac, x = comparison)) +
                ggplot2::geom_violin() +
                ggforce::geom_sina(alpha=0.5) +
                labs(x = "Bufferused comparison", y = "Weighted unifrac distance")

9.2.4 Combined plot

Combine the 2 plots into one figure

combined_alpha_violin <- alpha_violin / buffer_alpha_violin
ggsave(filename = "./figures/16s_beta_intergroup_divergence_violin_plots.png", 
       plot = combined_alpha_violin,
       device = "png", dpi = 300, units = "mm", height = 250, width = 250)