Chapter 2 Importing and preocessing data

2.1 qiime2R import

#Load qiime2R library
library(qiime2R)
library(phyloseq)
library(tidyverse)

#Convert QIIME2 artifacts into phyloseq object
physeq<-qza_to_phyloseq(
  features="./data/table.nocontrol.minfreq4628.qza",
  tree="./data/rooted-tree.qza",
  taxonomy="./data/taxonomy.qza",
  metadata = "./data/qiime2_metadata.nocontrols.v4.tsv"
)

#Some oRikenellaceae are given the genera human and the species gut metagenome
#Change genera "human" to NA
#change species "gut metagenome" to NA
tax_table(physeq)[,"Genus"] <- gsub(tax_table(physeq)[,"Genus"], pattern = "human", replacement = NA)
tax_table(physeq)[,"Species"] <- gsub(tax_table(physeq)[,"Species"], pattern = "gut metagenome", replacement = NA)

#save physeq object
save(physeq, file = "./data/physeq")

2.2 Preprocess data

#libraries
library("phyloseq")
library("microbiome")
library("ggplot2")

#load in phyloseq object
load("./data/physeq")

#subset samples to only keep rnalater samples
physeq <- subset_samples(physeq, !(RNAlater_washed_status == "unwashed"))

#Change names of RNA later samples
sample_name_vec <- unlist(as.vector(sample_data(physeq)[,1]))
sample_name_vec_corrected <- gsub(pattern = "_b", replacement = "",sample_name_vec)
sample_data(physeq)[,1] <- sample_name_vec_corrected

#In metadata add column with buffer and storage combined
sample_data(physeq)[,"Bufferused_and_Storageconditions"] <- 
  paste0(unlist(sample_data(physeq)[,"Bufferused"]),
         "_",
         unlist(sample_data(physeq)[,"Storageconditions"]))

save(physeq, file = "./data/preprocess_physeq")

#

# preprocess ####
#transform samples to relabund
physeq_relabund <- transform_sample_counts(physeq, function(x) x / sum(x))
#num ASVs b4 removal of low ASVs = 3965
# nrow(otu_table(physeq_relabund))
#first remove ASV with relabund equal to 0
physeq_relabund <-  filter_taxa(physeq_relabund, function(x) sum(x) > 0, TRUE)
#current num ASVs = 3626
# nrow(otu_table(physeq_relabund))
#remove ASVs with a mean less than 1e-5 (relabund)
physeq_relabund <-  filter_taxa(physeq_relabund, function(x) mean(x) > 1e-5, TRUE)
#current num ASVs = 1406
# nrow(otu_table(physeq_relabund))
#Let us see how much relabund we lost
#colSums(otu_table(physeq_relabund))
#all samples have greater than 0.98 relabund remaining so it looks good

#In metadata add column with buffer and storage combined
sample_data(physeq_relabund)[,"Bufferused_and_Storageconditions"] <- 
  paste0(unlist(sample_data(physeq_relabund)[,"Bufferused"]),
         "_",
         unlist(sample_data(physeq_relabund)[,"Storageconditions"]))
#save physeq object
save(physeq_relabund, file = "./data/physeq_relabund")

2.3 Lipid data

#libraries
library("phyloseq")

#Read in data
df <- read.csv(file = "data/Pilot SCFA data - MG.csv", check.names = FALSE,
                row.names = 1)
#Sample names
sample_names <- colnames(df)
#compounds
compound_names <- row.names(df[6:14,])


#First create OTU matrix
otu_df <- df[6:14,]
num_otu_df <- data.frame(apply(otu_df, 2, function(x) as.numeric(as.character(x))))
otu_mat <- data.matrix(num_otu_df)
rownames(otu_mat) <- paste0("OTU", 1:nrow(otu_mat))
colnames(otu_mat) <- paste0("Sample", 1:ncol(otu_mat))

#Tax matrix
tax_mat <- otu_mat[,1:2]
tax_mat[,1] <- compound_names
tax_mat[,2] <- compound_names
colnames(tax_mat) <- c("Compound","X")

#Sample_data_frame
metadf <- data.frame(lapply(df[1:5,], as.character), stringsAsFactors=FALSE)
row.names(metadf) <- row.names(df[1:5,])
metadf["Sample_name",] <- colnames(df)
#Change buffer labels
metadf[2,] <- gsub(pattern = "None", replacement = "No buffer", metadf[2,])
metadf[2,] <- gsub(pattern = "70% EtOH", replacement = "Ethanol", metadf[2,])
metadf[2,] <- gsub(pattern = "RNA later", replacement = "RNAlater", metadf[2,])

colnames(metadf) <- colnames(otu_mat)
t_metadf <- data.frame(t(metadf), stringsAsFactors=FALSE)
t_metadf[t_metadf$Storage.time == 0,"Temp"] <- "Baseline"
t_metadf[,"Buffer.type.Temp"] <- paste(t_metadf[,"Buffer.type"],t_metadf[,"Temp"],sep="|")
#Temp and time column
t_metadf[,"Temp.Storage.time"] <- 
  paste(t_metadf[,"Temp"],t_metadf[,"Storage.time"],sep="|")
#Change Baseline|0 to Baseline
t_metadf[,"Temp.Storage.time"] <- 
  gsub(pattern = "Baseline\\|0", replacement = "Baseline", 
       x = t_metadf[,"Temp.Storage.time"])
#Order of temp
t_metadf[,"Temp"] <- 
  factor(t_metadf[,"Temp"], levels = c("Baseline", "RT", "Fridge"))
#Sample column
t_metadf$`Sample name` <- t_metadf$Sample_name


#Create phyloseq object
OTU <- otu_table(otu_mat, taxa_are_rows = TRUE)
TAX <- tax_table(tax_mat)
sampledata <- sample_data(data.frame(t_metadf, stringsAsFactors=FALSE))
physeq <- phyloseq(OTU,TAX,sampledata)

#remove samples with no otu numbers
physeq = subset_samples(physeq, Sample_name != "6Q")
physeq = subset_samples(physeq, Sample_name != "6R")
physeq = subset_samples(physeq, Sample_name != "6S")
physeq = subset_samples(physeq, Sample_name != "6T")
physeq = subset_samples(physeq, Sample_name != "6P")

#save physeq object
save(physeq, file = "data/lipid_physeq_object")