--- title: "Canonical Correspondence Analysis (CCA) and Redundancy Analysis (RDA) for Protein Abundance Data" author: "Yaamini Venkataraman" date: "November 27, 2018" output: html_document --- ```{r setup, include=FALSE} knitr::opts_chunk$set(echo = TRUE) #Set up R Markdown File ``` In this script, I'll perform a constrained ordination to understand the protein abundance data given the environmental data. # Install dependencies ```{r} #install.packages("vegan") require(vegan) #install.packages("cluster") #Install cluster package. This package has the function daisy that is used for the gower dissimilarity matrix. require(cluster) #install.packages("dplyr") require(dplyr) source("../../biostats.R") #Biostats analysis wrapper ``` # Obtain session information ```{r} sessionInfo() #Obtain session information ``` # Import and reformat data ## Protein abundance data ```{r} proteinAbundance <- read.csv("2017-11-05-Averaged-Areas-Pivoted-Corrected.csv", header = TRUE) #Import protein abundance data row.names(proteinAbundance) <- proteinAbundance[,1] #Assign first column as rownames proteinAbundance <- proteinAbundance[,-1] #Remove first column proteinAbundance <- t(proteinAbundance) #Transpose dataframe head(proteinAbundance) #Confirm changes ``` ### Transform protein data ```{r} proteinAbundanceHT <- data.trans(proteinAbundance, method = 'hellingers', plot = FALSE) #Hellinger (asymmetric) transformation head(proteinAbundanceHT) #Confirm transformation ``` ## Environmental data For the environmental data, I want my rownames to be oyster samples, and my columns to be Site, Habitat, and the mean and variance of environmental variables at the designated Site and Habitat for the entire outplant. ### Calculate outplant means and variances #### pH ```{r} pHData <- read.csv("../../2017-11-15-Environmental-Data-and-Biomarker-Analyses/2017-12-13-Environmental-Data-Quality-Control/2018-11-18-pH-Tide-Data-Corrected.csv", header = TRUE) #Import pH data pHData <- pHData[,-1] #Remove extra first column head(pHData) #Confirm import ``` ```{r} colnames(pHData) ``` ```{r} pHMeanVariance <- data.frame("Site-Habitat" = colnames(pHData[4:12]), "Site" = c("WB", "WB", "SK", "SK", "PG", "CI", "CI", "FB", "FB"), "Habitat" = c("Eelgrass", "Bare", "Eelgrass", "Bare", "Bare", "Eelgrass", "Bare", "Eelgrass", "Bare"), "pHMean" = rep(0, times = 9), "pHVariance" = rep(0, times = 9)) #Create an empty dataframe to store outplant-wide means and variances head(pHMeanVariance) #Confirm dataframe creation ``` ```{r} nSiteHabitat <- 12 #pH data is column 4-12 for(i in 4:nSiteHabitat) { #For each column with pH data pHMeanVariance$pHMean[i-3] <- mean(pHData[,i], na.rm = TRUE) #Calculate the mean and put it in the designated row in pHMeanVariance$Mean } for(i in 4:nSiteHabitat) { #For each column with pH data pHMeanVariance$pHVariance[i-3] <- var(pHData[,i], na.rm = TRUE) #Calculate the variance and put it in the designated row in pHMeanVariance$Variance } pHMeanVariance <- pHMeanVariance[,-1] #Remove Site.Habitat column head(pHMeanVariance) #Confirm calculations ``` #### Dissolved oxygen ```{r} DOData <- read.csv("../../2017-11-15-Environmental-Data-and-Biomarker-Analyses/2017-12-13-Environmental-Data-Quality-Control/2018-11-18-DO-Tide-Data-Corrected.csv", header = TRUE) #Import DO data DOData <- DOData[,-1] #Remove extra first column head(DOData) #Confirm import ``` ```{r} colnames(DOData) #Look at columns to identify where the DO data is ``` ```{r} DOMeanVariance <- data.frame("Site-Habitat" = colnames(DOData[4:13]), "Site" = c("WB", "WB", "SK", "SK", "PG", "PG", "CI", "CI", "FB", "FB"), "Habitat" = c("Eelgrass", "Bare", "Eelgrass", "Bare", "Eelgrass", "Bare", "Eelgrass", "Bare", "Eelgrass", "Bare"), "DOMean" = rep(0, times = 10), "DOVariance" = rep(0, times = 10)) #Create an empty dataframe to store outplant-wide means and variances head(DOMeanVariance) #Confirm dataframe creation ``` ```{r} nSiteHabitat <- 13 #DO data is column 4-13 for(i in 4:nSiteHabitat) { #For each column with DO data DOMeanVariance$DOMean[i-3] <- mean(DOData[,i], na.rm = TRUE) #Calculate the mean and put it in the designated row in DOMeanVariance$DOMean } for(i in 4:nSiteHabitat) { #For each column with pH data DOMeanVariance$DOVariance[i-3] <- var(DOData[,i], na.rm = TRUE) #Calculate the variance and put it in the designated row in DOMeanVariance$DOVariance } DOMeanVariance <- DOMeanVariance[,-1] #Remove Site.Habitat column head(DOMeanVariance) #Confirm calculations ``` #### Salinity ```{r} salinityData <- read.csv("../../2017-11-15-Environmental-Data-and-Biomarker-Analyses/2017-12-13-Environmental-Data-Quality-Control/2018-11-18-Salinity-Tide-Data-Corrected.csv", header = TRUE) #Import pH data salinityData <- salinityData[,-1] #Remove extra first column head(salinityData) #Confirm import ``` ```{r} colnames(salinityData) ``` ```{r} salinityMeanVariance <- data.frame("Site-Habitat" = colnames(salinityData[4:11]), "Site" = c("CI", "CI", "FB", "FB", "PG", "SK", "SK", "WB"), "Habitat" = c("Bare", "Eelgrass", "Bare", "Eelgrass", "Eelgrass", "Bare", "Eelgrass", "Bare"), "salinityMean" = rep(0, times = 8), "salinityVariance" = rep(0, times = 8)) #Create an empty dataframe to store outplant-wide means and variances head(salinityMeanVariance) #Confirm dataframe creation ``` ```{r} nSiteHabitat <- 11 #pH data is column 4-11 for(i in 4:nSiteHabitat) { #For each column with pH data salinityMeanVariance$salinityMean[i-3] <- mean(salinityData[,i], na.rm = TRUE) #Calculate the mean and put it in the designated row } for(i in 4:nSiteHabitat) { #For each column with pH data salinityMeanVariance$salinityVariance[i-3] <- var(salinityData[,i], na.rm = TRUE) #Calculate the variance and put it in the designated row } salinityMeanVariance <- salinityMeanVariance[,-1] #Remove Site.Habitat column head(salinityMeanVariance) #Confirm calculations ``` #### Temperature ```{r} temperatureData <- read.csv("../../2017-11-15-Environmental-Data-and-Biomarker-Analyses/2017-12-13-Environmental-Data-Quality-Control/2018-11-18-Temperature-Corrected.csv", header = TRUE) #Import temperature data temperatureData <- temperatureData[,-1] #Remove extra first column head(temperatureData) #Confirm import ``` ```{r} colnames(temperatureData) #Figure out where data is ``` ```{r} temperatureMeanVariance <- data.frame("Site-Habitat" = colnames(temperatureData[3:12]), "Site" = c("WB", "WB", "SK", "SK", "PG", "PG", "CI", "CI", "FB", "FB"), "Habitat" = c("Eelgrass", "Bare", "Eelgrass", "Bare", "Eelgrass", "Bare", "Eelgrass", "Bare", "Eelgrass", "Bare"), "temperatureMean" = rep(0, times = 10), "temperatureVariance" = rep(0, times = 10)) #Create an empty dataframe to store outplant-wide means and variances head(temperatureMeanVariance) #Confirm dataframe creation ``` ```{r} nSiteHabitat <- 12 #pH data is column 3-12 for(i in 3:nSiteHabitat) { #For each column with pH data temperatureMeanVariance$temperatureMean[i-2] <- mean(temperatureData[,i], na.rm = TRUE) #Calculate the mean and put it in the designated row } for(i in 3:nSiteHabitat) { #For each column with pH data temperatureMeanVariance$temperatureVariance[i-2] <- var(temperatureData[,i], na.rm = TRUE) #Calculate the variance and put it in the designated row } temperatureMeanVariance <- temperatureMeanVariance[,-1] #Remove Site.Habitat column head(temperatureMeanVariance) #Confirm calculations ``` ### Merge environmental data ```{r} environmentalData <- left_join(temperatureMeanVariance, pHMeanVariance, by = c("Site", "Habitat")) #Use left_join to merge dataframes and add NAs where the two dataframes do not match. temperatureData is used as the base because it has observations for all 10 Site and Habitat combinations environmentalData <- left_join(environmentalData, DOMeanVariance, by = c("Site", "Habitat")) #Add salinity data environmentalData <- left_join(environmentalData, salinityMeanVariance, by = c("Site", "Habitat")) #Add temperature data head(environmentalData) ``` ### Combine oyster samples with environmental data ```{r} biologicalReplicates <- read.csv("../../2017-09-06-Biological-Replicate-Information.csv", header = TRUE) #Import biological replciate information head(biologicalReplicates) #Confirm import ``` ```{r} biologicalReplicates$Sample.Number <- as.character(biologicalReplicates$Sample.Number) #Convert sample number to character string biologicalReplicates$Sample.Number <- substr(biologicalReplicates$Sample.Number, 1, nchar(biologicalReplicates$Sample.Number)-2) #Remove -1 or -2 from end of sample number biologicalReplicates <- biologicalReplicates[1:49,] #Keep only the first 50 rows, since everything repeats. Also eliminate OBLNK2 colnames(biologicalReplicates)[3] <- "Habitat" #Change Eelgrass.Condition to Habitat tail(biologicalReplicates) #Confirm changes ``` ```{r} colnames(environmentalData) ``` ```{r} environmentalSampleData <- data.frame("Sample.Number" = biologicalReplicates$Sample.Number, "Site" = biologicalReplicates$Site, "Habitat" = biologicalReplicates$Habitat, "temperatureMean" = rep(0, times = length(biologicalReplicates$Sample.Number)), "temperatureVariance" = rep(0, times = length(biologicalReplicates$Sample.Number)), "pHMean" = rep(0, times = length(biologicalReplicates$Sample.Number)), "pHVariance" = rep(0, times = length(biologicalReplicates$Sample.Number)), "DOMean" = rep(0, times = length(biologicalReplicates$Sample.Number)), "DOVariance" = rep(0, times = length(biologicalReplicates$Sample.Number)), "salinityMean" = rep(0, times = length(biologicalReplicates$Sample.Number)), "salinityVariance" = rep(0, times = length(biologicalReplicates$Sample.Number))) #Create a new dataframe for sample and environmental data. Column names follow the same ordering as environmentalData head(environmentalSampleData) #Confirm dataframe creation ``` ```{r} colnames(environmentalSampleData) ``` ```{r} environmentalSampleData[environmentalSampleData$Site == "CI" & environmentalSampleData$Habitat == "Bare", 4:11] ``` ```{r} environmentalData[environmentalData$Site == "CI" & environmentalData$Habitat == "Bare",3:10] ``` ```{r} #Replace 0s in environmentalSampleData using corresponding information from environmentalData #Case Inlet environmentalSampleData[environmentalSampleData$Site == "CI" & environmentalSampleData$Habitat == "Bare", 4:11] <- environmentalData[environmentalData$Site == "CI" & environmentalData$Habitat == "Bare", 3:10] environmentalSampleData[environmentalSampleData$Site == "CI" & environmentalSampleData$Habitat == "Eelgrass", 4:11] <- environmentalData[environmentalData$Site == "CI" & environmentalData$Habitat == "Eelgrass", 3:10] #Fidalgo Bay environmentalSampleData[environmentalSampleData$Site == "FB" & environmentalSampleData$Habitat == "Bare", 4:11] <- environmentalData[environmentalData$Site == "FB" & environmentalData$Habitat == "Bare", 3:10] environmentalSampleData[environmentalSampleData$Site == "FB" & environmentalSampleData$Habitat == "Eelgrass", 4:11] <- environmentalData[environmentalData$Site == "FB" & environmentalData$Habitat == "Eelgrass", 3:10] #Port Gamble Bay environmentalSampleData[environmentalSampleData$Site == "PG" & environmentalSampleData$Habitat == "Bare", 4:11] <- environmentalData[environmentalData$Site == "PG" & environmentalData$Habitat == "Bare", 3:10] environmentalSampleData[environmentalSampleData$Site == "PG" & environmentalSampleData$Habitat == "Eelgrass", 4:11] <- environmentalData[environmentalData$Site == "PG" & environmentalData$Habitat == "Eelgrass", 3:10] #Skokomish River Delta environmentalSampleData[environmentalSampleData$Site == "SK" & environmentalSampleData$Habitat == "Bare", 4:11] <- environmentalData[environmentalData$Site == "SK" & environmentalData$Habitat == "Bare", 3:10] environmentalSampleData[environmentalSampleData$Site == "SK" & environmentalSampleData$Habitat == "Eelgrass", 4:11] <- environmentalData[environmentalData$Site == "SK" & environmentalData$Habitat == "Eelgrass", 3:10] #Willapa Bay environmentalSampleData[environmentalSampleData$Site == "WB" & environmentalSampleData$Habitat == "Bare", 4:11] <- environmentalData[environmentalData$Site == "WB" & environmentalData$Habitat == "Bare", 3:10] environmentalSampleData[environmentalSampleData$Site == "WB" & environmentalSampleData$Habitat == "Eelgrass", 4:11] <- environmentalData[environmentalData$Site == "WB" & environmentalData$Habitat == "Eelgrass", 3:10] head(environmentalSampleData) #Confirm replacements happened ``` ### Transform environmental data ```{r} row.names(environmentalSampleData) <- environmentalSampleData$Sample.Number #Assign Sample.Number as row names environmentalSampleData <- environmentalSampleData[,-1] #Remove Sample.Number column head(environmentalSampleData) #Confirm changes ``` ```{r} environmentalSampleData.log <- log(environmentalSampleData[,-c(1:2)] + 1) #Log transform all columns that aren't Site and Habitat head(environmentalSampleData.log) #Confirm changes ``` ```{r} environmentalSampleData.log <- environmentalSampleData.log[rownames(proteinAbundanceHT),] #Ensure that samples in environmental data are those from the protein abundance data length(environmentalSampleData.log$temperatureMean) == length(proteinAbundanceHT$`CHOYP_ACAA2.1.1|m.30666 ELGLNNDITNMNGGAIALGHPLAASGTR`) #The column lengths are equal, so we are good to go. ``` ```{r} #write.csv(environmentalSampleData.log, "2018-12-01-Log-Transformed-Daily-Environmental-Values.csv", row.names = TRUE) #Write out dataframe for future analyses. ``` # Determine the appropriate response model ```{r} decorana(proteinAbundanceHT, ira = 0) #Determine gradient length. Gradient length <2 is best represented by a linear model (RDA), 2-4 could be unimodal (RDA or CCA appropriate), and >4 is greater than unimodal (CCA). The length of DCA1 is 0.372921, so the underlying model is linear. I will use RDA. ``` # Conducting RDA ```{r} protEnvRDA <- rda(proteinAbundanceHT~., environmentalSampleData.log, na.action = na.exclude) summary(protEnvRDA) #Look at the summary. Total inertia is the total variation explained by the RDA. Constrained inertia is the variation explained in Y by X. Unconstrained inertia is everything else. ``` ## Significance tests ```{r} anova(protEnvRDA) #Test significance of the RDA in general. F = 1.3064, p = 0.195, so the constrained ordination is not significant. ``` ```{r} anova(protEnvRDA, by = 'axis') #Test significance of each axis. No axis is significant. ``` ```{r} anova(protEnvRDA, by = 'terms') #Test significance of each predictor variable from X. The mean and variance of temperature are marginally significant (p = 0.065 and 0.067 respectively) ``` ## Ordination biplot ```{r} plot(protEnvRDA, choices = c(1,2), type = 'points', display = 'wa', scaling = 2) #Visualize the ordination by plotting the weighted average scores. Scaling method optimizes for descriptors. ``` ```{r} plot(protEnvRDA, choices = c(1,2), type = 'n', display = 'wa', scaling = 2) #Visualize the WA scores text(protEnvRDA, choices = c(1,2), labels = row.names(proteinAbundanceHT)) #Add sample ID instead of points ``` # Interpreting the constrained ordination ```{r} round(intrasetcor(protEnvRDA), 5) #Obtain the intra-set correlation (structure) coefficients ``` ```{r} round(intersetcor(protEnvRDA), 5) #Obtain the inter-set correlation (structure) coefficients ``` ```{r} summary(protEnvRDA) #Get biplot scores for explanatory variables ``` ```{r} plot(protEnvRDA, choices = c(1,2), display = c('wa', 'sp', 'bp'), scaling = 2) #Obtain the triplot with weighted average scores (wa), species scores (sp), and environmental variable biplot scores (bp). Scale to enhance descriptors. ``` #Customize RDA ## Create a customization matrix ```{r} temporaryData <- data.frame("Sample.Number" = biologicalReplicates$Sample.Number, y = rep(x = 0, times = length(biologicalReplicates$Sample.Number))) #Create a temporary dataframe with sample names head(temporaryData) #Confirm dataframe creation ``` ```{r} NMDSColorShapeCustomization <- merge(x = temporaryData, y = biologicalReplicates, by = "Sample.Number") #Merge biological information with samples used NMDSColorShapeCustomization <- NMDSColorShapeCustomization[,-2] #Remove empty column head(NMDSColorShapeCustomization) #Confirm removal ``` ```{r} #Add region information (Puget Sound vs. Willapa Bay) attach(NMDSColorShapeCustomization) NMDSColorShapeCustomization <- NMDSColorShapeCustomization[order(Site),] #Reorder so sites are sorted alphabetically head(NMDSColorShapeCustomization) #Confirm sorting detach(NMDSColorShapeCustomization) NMDSColorShapeCustomization$Region <- c(rep("PS", times = (length(NMDSColorShapeCustomization$Site)-6)), rep("WB", times = 6)) #Add regional information NMDSColorShapeCustomization$NMDS.Region.Shapes <- c(rep(20, times = (length(NMDSColorShapeCustomization$Site)-6)), rep(8, times = 6)) head(NMDSColorShapeCustomization) #Confirm changes tail(NMDSColorShapeCustomization) #Confirm changes ``` ```{r} #Create a color and shape palette attach(NMDSColorShapeCustomization) NMDSColorShapeCustomization <- NMDSColorShapeCustomization[order(Site),] #Reorder so sites are sorted alphabetically head(NMDSColorShapeCustomization) #Confirm sorting detach(NMDSColorShapeCustomization) NMDS.Colors <- c(rep(x = "#00A9BD", times = sum(NMDSColorShapeCustomization$Site == "CI")), rep(x = "#38001C", times = sum(NMDSColorShapeCustomization$Site == "FB")), rep(x = "#440D82", times = sum(NMDSColorShapeCustomization$Site == "PG")), rep(x = "#017A74", times = sum(NMDSColorShapeCustomization$Site == "SK")), rep(x = "#EB8B0C", times = sum(NMDSColorShapeCustomization$Site == "WB"))) #Create a color vector NMDSColorShapeCustomization[,6] <- NMDS.Colors #Add the color vector to the dataframe head(NMDSColorShapeCustomization) #Confirm addition attach(NMDSColorShapeCustomization) NMDSColorShapeCustomization <- NMDSColorShapeCustomization[order(Habitat),] #Reorder so habitat is sorted alphabetically head(NMDSColorShapeCustomization) #Confirm sorting detach(NMDSColorShapeCustomization) NMDS.Shapes <- c(rep(x = 1, times = sum(NMDSColorShapeCustomization$Habitat == "Bare")), rep(x = 16, times = sum(NMDSColorShapeCustomization$Habitat == "Eelgrass"))) #Make a shape vector NMDSColorShapeCustomization[,7] <- NMDS.Shapes #Add the shape vector to the dataframe head(NMDSColorShapeCustomization) #Confirm addition attach(NMDSColorShapeCustomization) NMDSColorShapeCustomization <- NMDSColorShapeCustomization[order(Sample.Number),] #Resort by sample number head(NMDSColorShapeCustomization) #Confirm sorting detach(NMDSColorShapeCustomization) colnames(NMDSColorShapeCustomization) <- c("Sample.Number", "Site", "Eelgrass.Condition", "Region", "Region.Shape", "Color", "Shape") #Change column names head(NMDSColorShapeCustomization) #Confirm changes tail(NMDSColorShapeCustomization) #Confirm changes ``` ### All proteins and predictors ```{r} plot(protEnvRDA, choices=c(1,2), type = 'none', scaling = 2) #Create an empty plot based on RDA dimensions points(protEnvRDA, choices = c(1,2), display = 'wa', pch = NMDSColorShapeCustomization$Shape, cex = 1, scaling = 2, col = NMDSColorShapeCustomization$Color) #Plot objects as points points(protEnvRDA, choices = c(1,2), display = 'sp', pch = 22, col = 'grey20', bg = "grey20", cex = 0.5, scaling = 2) #Plot descriptors as text text(protEnvRDA, choices = c(1,2), display = 'bp', col = 'grey20', cex = 0.75) #Plot only marginally significant predictors ``` ### Only significant proteins and predictors ```{r} plot(protEnvRDA, choices=c(1,2), type = 'none', scaling = 2, xlab = "", ylab = "", xaxt = "n", yaxt = "n") #Create an empty plot based on RDA dimensions points(protEnvRDA, choices = c(1,2), display = 'wa', pch = NMDSColorShapeCustomization$Shape, cex = 1, scaling = 2, col = NMDSColorShapeCustomization$Color) #Plot objects as points points(protEnvRDA, choices = c(1,2), display = 'sp', col = 'grey20', pch = 4, cex = 1, scaling = 2, select = c(5, 12, 13, 19, 22, 30, 33)) #Plot significant proteins text(protEnvRDA, choices = c(1,2), display = 'bp', col = 'grey20', cex = 0.75, select = 1:2) #Plot only marginally significant predictors axis(side = 1, labels = TRUE, col = "grey80", cex.axis = 0.75) mtext(side = 1, text = "RDA1", line = 2) axis(side = 2, labels = TRUE, col = "grey80", cex.axis = 0.75) mtext(side = 2, text = "RDA2", line = 2) box(col = "grey80") legend("topleft", pch = c(rep(x = 1, times = 6), 16), legend=c('Case Inlet', "Fidalgo Bay", "Willapa Bay", "Skokomish", "Port Gamble", "Bare", "Eelgrass"), col=c('#00A9BD', '#38001C', '#440D82', '#017A74', '#EB8B0C', 'black', 'black'), cex = 0.5, bty = "n") ```