Chapter 2 Importing and preocessing data
2.1 qiime2R import
#Load qiime2R library
library(qiime2R)
library(phyloseq)
library(tidyverse)
#Convert QIIME2 artifacts into phyloseq object
physeq<-qza_to_phyloseq(
features="./data/table.nocontrol.minfreq4628.qza",
tree="./data/rooted-tree.qza",
taxonomy="./data/taxonomy.qza",
metadata = "./data/qiime2_metadata.nocontrols.v4.tsv"
)
#Some oRikenellaceae are given the genera human and the species gut metagenome
#Change genera "human" to NA
#change species "gut metagenome" to NA
tax_table(physeq)[,"Genus"] <- gsub(tax_table(physeq)[,"Genus"], pattern = "human", replacement = NA)
tax_table(physeq)[,"Species"] <- gsub(tax_table(physeq)[,"Species"], pattern = "gut metagenome", replacement = NA)
#save physeq object
save(physeq, file = "./data/physeq")
2.2 Preprocess data
#libraries
library("phyloseq")
library("microbiome")
library("ggplot2")
#load in phyloseq object
load("./data/physeq")
#subset samples to only keep rnalater samples
physeq <- subset_samples(physeq, !(RNAlater_washed_status == "unwashed"))
#Change names of RNA later samples
sample_name_vec <- unlist(as.vector(sample_data(physeq)[,1]))
sample_name_vec_corrected <- gsub(pattern = "_b", replacement = "",sample_name_vec)
sample_data(physeq)[,1] <- sample_name_vec_corrected
#In metadata add column with buffer and storage combined
sample_data(physeq)[,"Bufferused_and_Storageconditions"] <-
paste0(unlist(sample_data(physeq)[,"Bufferused"]),
"_",
unlist(sample_data(physeq)[,"Storageconditions"]))
save(physeq, file = "./data/preprocess_physeq")
#
# preprocess ####
#transform samples to relabund
physeq_relabund <- transform_sample_counts(physeq, function(x) x / sum(x))
#num ASVs b4 removal of low ASVs = 3965
# nrow(otu_table(physeq_relabund))
#first remove ASV with relabund equal to 0
physeq_relabund <- filter_taxa(physeq_relabund, function(x) sum(x) > 0, TRUE)
#current num ASVs = 3626
# nrow(otu_table(physeq_relabund))
#remove ASVs with a mean less than 1e-5 (relabund)
physeq_relabund <- filter_taxa(physeq_relabund, function(x) mean(x) > 1e-5, TRUE)
#current num ASVs = 1406
# nrow(otu_table(physeq_relabund))
#Let us see how much relabund we lost
#colSums(otu_table(physeq_relabund))
#all samples have greater than 0.98 relabund remaining so it looks good
#In metadata add column with buffer and storage combined
sample_data(physeq_relabund)[,"Bufferused_and_Storageconditions"] <-
paste0(unlist(sample_data(physeq_relabund)[,"Bufferused"]),
"_",
unlist(sample_data(physeq_relabund)[,"Storageconditions"]))
#save physeq object
save(physeq_relabund, file = "./data/physeq_relabund")
2.3 Lipid data
#libraries
library("phyloseq")
#Read in data
df <- read.csv(file = "data/Pilot SCFA data - MG.csv", check.names = FALSE,
row.names = 1)
#Sample names
sample_names <- colnames(df)
#compounds
compound_names <- row.names(df[6:14,])
#First create OTU matrix
otu_df <- df[6:14,]
num_otu_df <- data.frame(apply(otu_df, 2, function(x) as.numeric(as.character(x))))
otu_mat <- data.matrix(num_otu_df)
rownames(otu_mat) <- paste0("OTU", 1:nrow(otu_mat))
colnames(otu_mat) <- paste0("Sample", 1:ncol(otu_mat))
#Tax matrix
tax_mat <- otu_mat[,1:2]
tax_mat[,1] <- compound_names
tax_mat[,2] <- compound_names
colnames(tax_mat) <- c("Compound","X")
#Sample_data_frame
metadf <- data.frame(lapply(df[1:5,], as.character), stringsAsFactors=FALSE)
row.names(metadf) <- row.names(df[1:5,])
metadf["Sample_name",] <- colnames(df)
#Change buffer labels
metadf[2,] <- gsub(pattern = "None", replacement = "No buffer", metadf[2,])
metadf[2,] <- gsub(pattern = "70% EtOH", replacement = "Ethanol", metadf[2,])
metadf[2,] <- gsub(pattern = "RNA later", replacement = "RNAlater", metadf[2,])
colnames(metadf) <- colnames(otu_mat)
t_metadf <- data.frame(t(metadf), stringsAsFactors=FALSE)
t_metadf[t_metadf$Storage.time == 0,"Temp"] <- "Baseline"
t_metadf[,"Buffer.type.Temp"] <- paste(t_metadf[,"Buffer.type"],t_metadf[,"Temp"],sep="|")
#Temp and time column
t_metadf[,"Temp.Storage.time"] <-
paste(t_metadf[,"Temp"],t_metadf[,"Storage.time"],sep="|")
#Change Baseline|0 to Baseline
t_metadf[,"Temp.Storage.time"] <-
gsub(pattern = "Baseline\\|0", replacement = "Baseline",
x = t_metadf[,"Temp.Storage.time"])
#Order of temp
t_metadf[,"Temp"] <-
factor(t_metadf[,"Temp"], levels = c("Baseline", "RT", "Fridge"))
#Sample column
t_metadf$`Sample name` <- t_metadf$Sample_name
#Create phyloseq object
OTU <- otu_table(otu_mat, taxa_are_rows = TRUE)
TAX <- tax_table(tax_mat)
sampledata <- sample_data(data.frame(t_metadf, stringsAsFactors=FALSE))
physeq <- phyloseq(OTU,TAX,sampledata)
#remove samples with no otu numbers
physeq = subset_samples(physeq, Sample_name != "6Q")
physeq = subset_samples(physeq, Sample_name != "6R")
physeq = subset_samples(physeq, Sample_name != "6S")
physeq = subset_samples(physeq, Sample_name != "6T")
physeq = subset_samples(physeq, Sample_name != "6P")
#save physeq object
save(physeq, file = "data/lipid_physeq_object")