#In this script, I'll use an NMDS plot to see if my technical replicates are similar. #### IMPORT DATA #### SRMAreas <- read.csv("2017-09-12-Gigas-SRM-ReplicatesOnly-PostDilutionCurve-NoPivot-RevisedSettings-Report.csv", na.strings = "#N/A") #Specify Skyline's special way of designating N/A values head(SRMAreas) #Confirm import tail(SRMAreas) #Confirm import #### CREATE A MASTER DATAFRAME #### #I want to merge my Skyline data with sample names, sites, and eelgrass condition to create a master dataframe will all possible information sequenceFile <- read.csv("2017-07-28-SRM-Samples-Sequence-File.csv", na.strings = "N/A") # Import sequence file head(sequenceFile) #Confirm import sequenceFile <- sequenceFile[,c(2,3,8)] #Keep the Replicate.Name, Comment and TIC columns names(sequenceFile) <- c("Replicate.Name", "Sample.Number", "TIC") head(sequenceFile) #Confirm change masterSRMData <- merge(x = SRMAreas, y = sequenceFile, by = "Replicate.Name") #Merge the sample names and replicate names to use for analysis. head(masterSRMData) #Confirm merge tail(masterSRMData) #Confirm merge biologicalReplicates <- read.csv("2017-09-06-Biological-Replicate-Information.csv", na.strings = "N/A", fileEncoding="UTF-8-BOM") #Import site and eelgrass condition information (i.e. biological replicate information), using specific file encoding information head(biologicalReplicates) #Confirm import tail(biologicalReplicates) #Confirm import masterSRMDataBiologicalReplicates <- merge(x = masterSRMData, y = biologicalReplicates, by = "Sample.Number") #Add biological replicate information to master list. head(masterSRMDataBiologicalReplicates) #Confirm change #write.csv(x = masterSRMDataBiologicalReplicates, file = "2017-09-07-Master-SRM-Data-BiologicalReplicates-NoBlanks-NoPivot.csv") #Write out master dataframe #### SUBSET DATA FOR NMDS PLOT #### #For the NMDS, I want only the protein/peptide/transition information and peak area SRMDataNMDS <- masterSRMDataBiologicalReplicates #Duplicate master list into a new dataframe head(SRMDataNMDS) #Confirm copy tail(SRMDataNMDS) #Confirm copy SRMDataNMDS <- SRMDataNMDS[,-c(2, 5, 7, 10, 11)] #Remove extraneous columns: Replicate.Name, Transition, Peptide.Retention.Time, Site, Eelgrass head(SRMDataNMDS) #Confirm column removal SRMDataNMDS <- SRMDataNMDS[! SRMDataNMDS$Protein.Name %in% "PRTC peptides", ] #Remove PRTC peptide data head(SRMDataNMDS) #Confirm removal transform(SRMDataNMDS, Area = as.numeric(Area)) #Make sure Area is recognized as a numeric variable is.numeric(SRMDataNMDS$Area) #Confirm change transform(SRMDataNMDS, TIC = as.numeric(TIC)) #Make sure TIC is recognized as a numeric variable is.numeric(SRMDataNMDS$TIC) #Confirm change #### MAKE NMDS WITHOUT NORMALIZING #### #The goal is to have the row names of my new dataframe be Protein/Peptides/Transitions, with the column names as the sample number SRMDataNonNormalizedNMDS <- SRMDataNMDS #Create a duplicate dataframe SRMDataNonNormalizedNMDS <- SRMDataNMDS[,-6] #Remove TIC column head(SRMDataNonNormalizedNMDS) #Confirm creation #My first step is to change my dataframe from long to wide (i.e. cast it) library(reshape2) #Instal package to pivot table SRMDataNMDSNonNormalizedPivoted <- dcast(SRMDataNonNormalizedNMDS, Protein.Name + Peptide.Sequence + Fragment.Ion ~ Sample.Number) #Cast table! Protein/Peptides/Transitions remain as columns with Sample Number as column headers. Normalized.Area used as value column by default. head(SRMDataNMDSNonNormalizedPivoted) #Confirm cast. SRMDataNMDSNonNormalizedPivoted$RowNames <- paste(SRMDataNMDSNonNormalizedPivoted$Protein.Name, SRMDataNMDSNonNormalizedPivoted$Peptide.Sequence, SRMDataNMDSNonNormalizedPivoted$Fragment.Ion) #Merge Protein, Peptide and Transition information into one column head(SRMDataNMDSNonNormalizedPivoted) #Confirm column merge SRMDataNMDSNonNormalizedPivoted <- SRMDataNMDSNonNormalizedPivoted[,-c(1:3)] #Remove unmerged columns head(SRMDataNMDSNonNormalizedPivoted) #Confirm column removal #write.csv(SRMDataNMDSNonNormalizedPivoted, file = "2017-09-07-SRM-Data-NMDS-Pivoted.csv") #Wrote out as .csv to make future analyses easier. #Now I can make an NMDS plot #Load the source file for the biostats package source("biostats.R") #Either load the source R script or copy paste install.packages("vegan") #Install vegan package library(vegan) SRMDataNMDSNonNormalizedPivotedCorrected <- SRMDataNMDSNonNormalizedPivoted #Duplicate dataframe SRMDataNMDSNonNormalizedPivotedCorrected[is.na(SRMDataNMDSNonNormalizedPivotedCorrected)] <- 0 #Replace NAs with 0s head(SRMDataNMDSNonNormalizedPivotedCorrected) #Confirm there are no NAs area.protID <- SRMDataNMDSNonNormalizedPivotedCorrected[-93] #Save all area data as a new dataframe rownames(area.protID) <- SRMDataNMDSNonNormalizedPivotedCorrected[,93] #Make sure last column of protein names is recognized as row names instead of values head(area.protID) #Confirm changes area.t <- t(area.protID) #Transpose the file so that rows and columns are switched head(area.t) #Confirm transposition area.tra <- (area.t+1) #Add 1 to all values before transforming area.tra <- data.trans(area.tra, method = 'log', plot = FALSE) #log(x+1) transformation proc.nmds.nonnorm.euclidean <- metaMDS(area.t, distance = 'euclidean', k = 2, trymax = 10000, autotransform = FALSE) #Make MDS dissimilarity matrix using euclidean distance. Julian confirmed that I should use euclidean distances, and not bray-curtis #stressplot(proc.nmds.nonnorm.euclidean) #Make Shepard plot #ordiplot(proc.nmds.nonnorm.euclidean) #Plot basic NMDS #vec.proc.nmds.nonnorm.euclidean <- envfit(proc.nmds.nonnorm.euclidean$points, area.t, perm = 1000) #Calculate loadings ordiplot(proc.nmds.nonnorm.euclidean, choices = c(1,2), type = "text", display = "sites") #Plot refined NMDS displaying only samples with their names #plot(vec.proc.nmds.euclidean, p.max=.01, col='blue') #Plot eigenvectors #proc.nmds.nonnorm.euclidean.log <- metaMDS(area.tra, distance = 'euclidean', k = 2, trymax = 10000, autotransform = FALSE) #Make MDS dissimilarity matrix using euclidean distance #stressplot(proc.nmds.nonnorm.euclidean.log) #Make Shepard plot #ordiplot(proc.nmds.nonnorm.euclidean.log) #Plot basic NMDS #ordiplot(proc.nmds.nonnorm.euclidean.log, choices = c(1,2), type = "text", display = "sites") #Plot refined NMDS displaying only samples with their names #proc.nmds.nonnorm.euclidean.autotransform <- metaMDS(area.t, distance = 'euclidean', k = 2, trymax = 10000, autotransform = TRUE) #Make MDS dissimilarity matrix using euclidean distance and autotransformation #stressplot(proc.nmds.nonnorm.euclidean.autotransform) #Make Shepard plot #ordiplot(proc.nmds.nonnorm.euclidean.autotransform) #Plot basic NMDS #ordiplot(proc.nmds.nonnorm.euclidean.autotransform, choices = c(1,2), type = "text", display = "sites") #Plot refined NMDS displaying only samples with their names #jpeg(filename = "2017-09-11-NMDS-TechnicalReplication-NonNormalized.jpeg", width = 1000, height = 1000) #ordiplot(proc.nmds.nonnorm.euclidean, choices = c(1,2), type = "text", display = "sites") #Plot refined NMDS displaying only samples with their names #dev.off() #Only the euclidean non-transformed version gives me an NMDS plot, but there looks like there's variation that should be accounted for by normalizing. #### NORMALIZE BY TIC VALUES #### SRMNormalizedDataNMDS <- SRMDataNMDS #Duplicate dataframe SRMNormalizedDataNMDS$Normalized.Area <- SRMNormalizedDataNMDS$Area/SRMDataNMDS$TIC #Divide areas by corresponding TIC values head(SRMNormalizedDataNMDS) #Confirm division SRMNormalizedDataNMDS <- SRMNormalizedDataNMDS[,-c(5,6)] #Remove nonnormalized area and TIC columns head(SRMNormalizedDataNMDS) #Confirm column removal #### REFORMAT DATAFRAME FOR NMDS #### #The goal is to have the row names of my new dataframe be Protein/Peptides/Transitions, with the column names as the sample number #My first step is to change my dataframe from long to wide (i.e. cast it) library(reshape2) #Instal package to pivot table SRMDataNMDSPivoted <- dcast(SRMNormalizedDataNMDS, Protein.Name + Peptide.Sequence + Fragment.Ion ~ Sample.Number) #Cast table! Protein/Peptides/Transitions remain as columns with Sample Number as column headers. Normalized.Area used as value column by default. head(SRMDataNMDSPivoted) #Confirm cast. SRMDataNMDSPivoted$RowNames <- paste(SRMDataNMDSPivoted$Protein.Name, SRMDataNMDSPivoted$Peptide.Sequence, SRMDataNMDSPivoted$Fragment.Ion) #Merge Protein, Peptide and Transition information into one column head(SRMDataNMDSPivoted) #Confirm column merge SRMDataNMDSPivoted <- SRMDataNMDSPivoted[,-c(1:3)] #Remove unmerged columns head(SRMDataNMDSPivoted) #Confirm column removal #write.csv(SRMDataNMDSPivoted, file = "2017-09-11-SRM-Data-Normalized-NMDS-Pivoted.csv") #Wrote out as .csv to make future analyses easier. #### NMDS PLOT #### #Load the source file for the biostats package source("biostats.R") #Either load the source R script or copy paste. install.packages("vegan") #Install vegan package library(vegan) SRMDataNMDSPivotedCorrected <- SRMDataNMDSPivoted #Duplicate dataframe SRMDataNMDSPivotedCorrected[is.na(SRMDataNMDSPivotedCorrected)] <- 0 #Replace NAs with 0s head(SRMDataNMDSPivotedCorrected) #Confirm there are no NAs area.protID2 <- SRMDataNMDSPivotedCorrected[-93] #Save all area data as a new dataframe rownames(area.protID2) <- SRMDataNMDSPivotedCorrected[,93] #Make sure last column of protein names is recognized as row names instead of values head(area.protID2) #Confirm changes area2.t <- t(area.protID2) #Transpose the file so that rows and columns are switched head(area2.t) #Confirm transposition area2.tra <- (area2.t+1) #Add 1 to all values before transforming area2.tra <- data.trans(area2.tra, method = 'log', plot = FALSE) #log(x+1) transformation proc.nmds.euclidean <- metaMDS(area2.t, distance = 'euclidean', k = 2, trymax = 10000, autotransform = FALSE) #Make MDS dissimilarity matrix using euclidean distance. Julian confirmed that I should use euclidean distances, and not bray-curtis stressplot(proc.nmds.euclidean) #Make Shepard plot ordiplot(proc.nmds.euclidean) #Plot basic NMDS vec.proc.nmds.euclidean <- envfit(proc.nmds.euclidean$points, area2.t, perm = 1000) #Calculate loadings ordiplot(proc.nmds.euclidean, choices = c(1,2), type = "text", display = "sites") #Plot refined NMDS displaying only samples with their names plot(vec.proc.nmds.euclidean, p.max=.01, col='blue') #Plot eigenvectors proc.nmds.euclidean.log <- metaMDS(area2.tra, distance = 'euclidean', k = 2, trymax = 10000, autotransform = FALSE) #Make MDS dissimilarity matrix using euclidean distance #stressplot(proc.nmds.euclidean.log) #Make Shepard plot #ordiplot(proc.nmds.euclidean.log) #Plot basic NMDS ordiplot(proc.nmds.euclidean.log, choices = c(1,2), type = "text", display = "sites") #Plot refined NMDS displaying only samples with their names proc.nmds.euclidean.autotransform <- metaMDS(area2.t, distance = 'euclidean', k = 2, trymax = 10000, autotransform = TRUE) #Make MDS dissimilarity matrix using euclidean distance and autotransformation #stressplot(proc.nmds.euclidean.autotransform) #Make Shepard plot #ordiplot(proc.nmds.euclidean.autotransform) #Plot basic NMDS ordiplot(proc.nmds.euclidean.autotransform, choices = c(1,2), type = "text", display = "sites") #Plot refined NMDS displaying only samples with their names #jpeg(filename = "2017-09-08-NMDS-TechnicalReplication-Normalized.jpeg", width = 1000, height = 1000) #ordiplot(proc.nmds.euclidean, choices = c(1,2), type = "text", display = "sites") #Plot refined NMDS displaying only samples with their names #dev.off() #### CALCULATE DISTANCES BETWEEN TECHNICAL REPLICATE ORDINATIONS #### NMDSCoordinates <- proc.nmds.euclidean$points #Save NMDS coordinates of each point in a new dataframe head(NMDSCoordinates) #Confirm dataframe creation nSamples <- length(NMDSCoordinates)/2 #Calculate the number of samples sampleDistances <- vector(length = nSamples) #Create an empty vector to store distance values for(i in 1:nSamples) { #For rows in NMDSCoordinates sampleDistances[i] <- sqrt((NMDSCoordinates[i,1]-NMDSCoordinates[i,2])^2 + (NMDSCoordinates[i+1,1]-NMDSCoordinates[i+1,2])^2) #Calculate distance between ordinations print(sampleDistances[i]) #Print the distance value } sampleDistances #Confirm vector creation. This vector has all consecutive pairs, including those that are not paris of technical replicates. I need to retain just the odd numbered rows. technicalReplicates <- rownames(NMDSCoordinates) #Save rownames as a new vector technicalReplicates #Confirm vector creation technicalReplicateDistances <- data.frame(Sample = technicalReplicates[seq(from = 1, to = nSamples, by = 2)], Distance = sampleDistances[seq(from = 1, to = nSamples, by = 2)]) #Create a new dataframe with just odd numbered row distances (technical replicate pairs) head(technicalReplicateDistances) #Confirm dataframe creation tail(technicalReplicateDistances) #Confirm dataframe creation #### PLOT DISTANCES BETWEEN TECHNICAL REPLICATE ORDINATIONS #### #jpeg(filename = "2017-09-08-NMDS-TechnicalReplication-Ordination-Distances.jpeg", width = 1000, height = 1000) plot(x = technicalReplicateDistances$Sample, y = technicalReplicateDistances$Distance, type = "line", xlab = "Sample", ylab = "Distance between Ordinations") #dev.off()