#In this script, I'll use an NMDS plot to assess technical replication of my PRTC peptides, which is something Emma was interested in. I'll do this with both normalized and nonnormalized data.

#### SET WORKING DIRECTORY ####
setwd(dir = "../..") #Set my working directory as main SRM analysis folder, and not the subdirectory where the R script is located
getwd() #Confirm change

#### IMPORT DATA ####

SRMAreas <- read.csv("2017-09-12-Gigas-SRM-ReplicatesOnly-PostDilutionCurve-NoPivot-RevisedSettings-Report.csv", na.strings = "#N/A") #Specify Skyline's special way of designating N/A values
head(SRMAreas) #Confirm import
tail(SRMAreas) #Confirm import

#### CREATE A MASTER DATAFRAME ####

#I want to merge my Skyline data with sample names, sites, and eelgrass condition to create a master dataframe will all possible information

sequenceFile <- read.csv("2017-07-28-SRM-Samples-Sequence-File.csv", na.strings = "N/A") # Import sequence file
head(sequenceFile) #Confirm import
sequenceFile <- sequenceFile[,c(2,3,8)] #Keep the Replicate.Name, Comment and TIC columns
names(sequenceFile) <- c("Replicate.Name", "Sample.Number", "TIC")
head(sequenceFile) #Confirm change
masterSRMData <- merge(x = SRMAreas, y = sequenceFile, by = "Replicate.Name") #Merge the sample names and replicate names to use for analysis.
head(masterSRMData) #Confirm merge
tail(masterSRMData) #Confirm merge

biologicalReplicates <- read.csv("2017-09-06-Biological-Replicate-Information.csv", na.strings = "N/A", fileEncoding="UTF-8-BOM") #Import site and eelgrass condition information (i.e. biological replicate information), using specific file encoding information
head(biologicalReplicates) #Confirm import
tail(biologicalReplicates) #Confirm import
masterSRMDataBiologicalReplicates <- merge(x = masterSRMData, y = biologicalReplicates, by = "Sample.Number") #Add biological replicate information to master list.
head(masterSRMDataBiologicalReplicates) #Confirm change

#### SUBSET DATA FOR NMDS PLOT ####

#For the NMDS, I want only the protein/peptide/transition information and peak area for PRTC peptides

SRMDataNMDS <- masterSRMDataBiologicalReplicates #Duplicate master list into a new dataframe
head(SRMDataNMDS) #Confirm copy
tail(SRMDataNMDS) #Confirm copy
SRMDataNMDS <- SRMDataNMDS[,-c(2, 5, 7, 10, 11)] #Remove extraneous columns: Replicate.Name, Transition, Peptide.Retention.Time, Site, Eelgrass
head(SRMDataNMDS) #Confirm column removal
SRMDataNMDS <- SRMDataNMDS[ SRMDataNMDS$Protein.Name %in% "PRTC peptides", ] #Keep only PRTC peptide data
head(SRMDataNMDS) #Confirm removal
transform(SRMDataNMDS, Area = as.numeric(Area)) #Make sure Area is recognized as a numeric variable
is.numeric(SRMDataNMDS$Area) #Confirm change
transform(SRMDataNMDS, TIC = as.numeric(TIC)) #Make sure TIC is recognized as a numeric variable
is.numeric(SRMDataNMDS$TIC) #Confirm change

#### MAKE NMDS WITHOUT NORMALIZING ####

#The goal is to have the row names of my new dataframe be Protein/Peptides/Transitions, with the column names as the sample number

SRMDataNonNormalizedNMDS <- SRMDataNMDS #Create a duplicate dataframe
SRMDataNonNormalizedNMDS <- SRMDataNMDS[,-6] #Remove TIC column
head(SRMDataNonNormalizedNMDS) #Confirm creation

#My first step is to change my dataframe from long to wide (i.e. cast it)
library(reshape2) #Instal package to pivot table
SRMDataNMDSNonNormalizedPivoted <- dcast(SRMDataNonNormalizedNMDS, Protein.Name + Peptide.Sequence + Fragment.Ion ~ Sample.Number) #Cast table! Protein/Peptides/Transitions remain as columns with Sample Number as column headers. Normalized.Area used as value column by default.
head(SRMDataNMDSNonNormalizedPivoted) #Confirm cast.
SRMDataNMDSNonNormalizedPivoted$RowNames <- paste(SRMDataNMDSNonNormalizedPivoted$Peptide.Sequence, SRMDataNMDSNonNormalizedPivoted$Fragment.Ion) #Merge Protein, Peptide and Transition information into one column
head(SRMDataNMDSNonNormalizedPivoted) #Confirm column merge
SRMDataNMDSNonNormalizedPivoted <- SRMDataNMDSNonNormalizedPivoted[,-c(1:3)] #Remove unmerged columns
head(SRMDataNMDSNonNormalizedPivoted) #Confirm column removal

#Now I can make an NMDS plot

#Load the source file for the biostats package
source("biostats.R") #Either load the source R script or copy paste
install.packages("vegan") #Install vegan package
library(vegan)

SRMDataNMDSNonNormalizedPivotedCorrected <- SRMDataNMDSNonNormalizedPivoted #Duplicate dataframe
SRMDataNMDSNonNormalizedPivotedCorrected[is.na(SRMDataNMDSNonNormalizedPivotedCorrected)] <- 0 #Replace NAs with 0s
head(SRMDataNMDSNonNormalizedPivotedCorrected) #Confirm there are no NAs

area.protID <- SRMDataNMDSNonNormalizedPivotedCorrected[-93] #Save all area data as a new dataframe
rownames(area.protID) <- SRMDataNMDSNonNormalizedPivotedCorrected[,93] #Make sure last column of protein names is recognized as row names instead of values
head(area.protID) #Confirm changes

area.t <- t(area.protID) #Transpose the file so that rows and columns are switched
head(area.t) #Confirm transposition
area.tra <- (area.t+1) #Add 1 to all values before transforming
area.tra <- data.trans(area.tra, method = 'log', plot = FALSE) #log(x+1) transformation

#proc.nmds.nonnorm.euclidean <- metaMDS(area.t, distance = 'bray', k = 2, trymax = 10000, autotransform = FALSE) #Make MDS dissimilarity matrix using euclidean distance. Julian confirmed that I should use euclidean distances, and not bray-curtis
#stressplot(proc.nmds.nonnorm.euclidean) #Make Shepard plot
#ordiplot(proc.nmds.nonnorm.euclidean) #Plot basic NMDS
#vec.proc.nmds.nonnorm.euclidean <- envfit(proc.nmds.nonnorm.euclidean$points, area.t, perm = 1000) #Calculate loadings
#ordiplot(proc.nmds.nonnorm.euclidean, choices = c(1,2), type = "text", display = "sites") #Plot refined NMDS displaying only samples with their names
#plot(vec.proc.nmds.euclidean, p.max=.01, col='blue') #Plot eigenvectors

#Couldn't use nontransformed data: Error in if (any(w < 0) || sum(w) == 0) stop("weights must be non-negative and not all zero") : missing value where TRUE/FALSE needed

proc.nmds.nonnorm.euclidean.log <- metaMDS(area.tra, distance = 'euclidean', k = 2, trymax = 10000, autotransform = FALSE) #Make MDS dissimilarity matrix using euclidean distance
#stressplot(proc.nmds.nonnorm.euclidean.log) #Make Shepard plot
#ordiplot(proc.nmds.nonnorm.euclidean.log) #Plot basic NMDS
ordiplot(proc.nmds.nonnorm.euclidean.log, choices = c(1,2), type = "text", display = "sites") #Plot refined NMDS displaying only samples with their names

proc.nmds.nonnorm.euclidean.autotransform <- metaMDS(area.t, distance = 'euclidean', k = 2, trymax = 10000, autotransform = TRUE) #Make MDS dissimilarity matrix using euclidean distance and autotransformation
#stressplot(proc.nmds.nonnorm.euclidean.autotransform) #Make Shepard plot
#ordiplot(proc.nmds.nonnorm.euclidean.autotransform) #Plot basic NMDS
ordiplot(proc.nmds.nonnorm.euclidean.autotransform, choices = c(1,2), type = "text", display = "sites") #Plot refined NMDS displaying only samples with their names

#I saved the two iterations where my data was transformed (log or autotransformation)

#jpeg(filename = "2017-10-10-Troubleshooting/2017-10-24-PRTC-NMDS/2017-10-24-NMDS-TechnicalReplication-NonNormalized-LogTransformation.jpeg", width = 1000, height = 1000)
#ordiplot(proc.nmds.nonnorm.euclidean.log, choices = c(1,2), type = "text", display = "sites") #Plot refined NMDS displaying only samples with their names
#dev.off()

#jpeg(filename = "2017-10-10-Troubleshooting/2017-10-24-PRTC-NMDS/2017-10-24-NMDS-TechnicalReplication-NonNormalized-Autotransformation.jpeg", width = 1000, height = 1000)
#ordiplot(proc.nmds.nonnorm.euclidean.autotransform, choices = c(1,2), type = "text", display = "sites") #Plot refined NMDS displaying only samples with their names
#dev.off()

#### NORMALIZE BY TIC VALUES ####

SRMNormalizedDataNMDS <- SRMDataNMDS #Duplicate dataframe
SRMNormalizedDataNMDS$Normalized.Area <- SRMNormalizedDataNMDS$Area/SRMDataNMDS$TIC #Divide areas by corresponding TIC values
head(SRMNormalizedDataNMDS) #Confirm division
SRMNormalizedDataNMDS <- SRMNormalizedDataNMDS[,-c(5,6)] #Remove nonnormalized area and TIC columns
head(SRMNormalizedDataNMDS) #Confirm column removal

#### REFORMAT DATAFRAME FOR NMDS ####

SRMDataNMDSPivoted <- dcast(SRMNormalizedDataNMDS, Protein.Name + Peptide.Sequence + Fragment.Ion ~ Sample.Number) #Cast table! Protein/Peptides/Transitions remain as columns with Sample Number as column headers. Normalized.Area used as value column by default.
head(SRMDataNMDSPivoted) #Confirm cast.
SRMDataNMDSPivoted$RowNames <- paste(SRMDataNMDSPivoted$Protein.Name, SRMDataNMDSPivoted$Peptide.Sequence, SRMDataNMDSPivoted$Fragment.Ion) #Merge Protein, Peptide and Transition information into one column
head(SRMDataNMDSPivoted) #Confirm column merge
SRMDataNMDSPivoted <- SRMDataNMDSPivoted[,-c(1:3)] #Remove unmerged columns
head(SRMDataNMDSPivoted) #Confirm column removal

#### NMDS PLOT ####

SRMDataNMDSPivotedCorrected <- SRMDataNMDSPivoted #Duplicate dataframe
SRMDataNMDSPivotedCorrected[is.na(SRMDataNMDSPivotedCorrected)] <- 0 #Replace NAs with 0s
head(SRMDataNMDSPivotedCorrected) #Confirm there are no NAs

area.protID2 <- SRMDataNMDSPivotedCorrected[-93] #Save all area data as a new dataframe
rownames(area.protID2) <- SRMDataNMDSPivotedCorrected[,93] #Make sure last column of protein names is recognized as row names instead of values
head(area.protID2) #Confirm changes

area2.t <- t(area.protID2) #Transpose the file so that rows and columns are switched
head(area2.t) #Confirm transposition
area2.tra <- (area2.t+1) #Add 1 to all values before transforming
area2.tra <- data.trans(area2.tra, method = 'log', plot = FALSE) #log(x+1) transformation

proc.nmds.euclidean <- metaMDS(area2.t, distance = 'euclidean', k = 2, trymax = 10000, autotransform = FALSE) #Make MDS dissimilarity matrix using euclidean distance. Julian confirmed that I should use euclidean distances, and not bray-curtis
#stressplot(proc.nmds.euclidean) #Make Shepard plot
#ordiplot(proc.nmds.euclidean) #Plot basic NMDS
#vec.proc.nmds.euclidean <- envfit(proc.nmds.euclidean$points, area2.t, perm = 1000) #Calculate loadings
ordiplot(proc.nmds.euclidean, choices = c(1,2), type = "text", display = "sites") #Plot refined NMDS displaying only samples with their names
#plot(vec.proc.nmds.euclidean, p.max=.01, col='blue') #Plot eigenvectors

proc.nmds.euclidean.log <- metaMDS(area2.tra, distance = 'euclidean', k = 2, trymax = 10000, autotransform = FALSE) #Make MDS dissimilarity matrix using euclidean distance
#stressplot(proc.nmds.euclidean.log) #Make Shepard plot
#ordiplot(proc.nmds.euclidean.log) #Plot basic NMDS
ordiplot(proc.nmds.euclidean.log, choices = c(1,2), type = "text", display = "sites") #Plot refined NMDS displaying only samples with their names

proc.nmds.euclidean.autotransform <- metaMDS(area2.t, distance = 'euclidean', k = 2, trymax = 10000, autotransform = TRUE) #Make MDS dissimilarity matrix using euclidean distance and autotransformation
#stressplot(proc.nmds.euclidean.autotransform) #Make Shepard plot
#ordiplot(proc.nmds.euclidean.autotransform) #Plot basic NMDS
ordiplot(proc.nmds.euclidean.autotransform, choices = c(1,2), type = "text", display = "sites") #Plot refined NMDS displaying only samples with their names

#The log transformed and no transformation plots looked similar. The autotransformed plot looked like the previous nonnormalized autotransformed plot. I'll save the log and autotransformed versions for comparison.

#jpeg(filename = "2017-10-10-Troubleshooting/2017-10-24-PRTC-NMDS/2017-10-24-NMDS-TechnicalReplication-Normalized-LogTransformation.jpeg", width = 1000, height = 1000)
#ordiplot(proc.nmds.euclidean.log, choices = c(1,2), type = "text", display = "sites") #Plot refined NMDS displaying only samples with their names
#dev.off()

#jpeg(filename = "2017-10-10-Troubleshooting/2017-10-24-PRTC-NMDS/2017-10-24-NMDS-TechnicalReplication-Normalized-Autotransformation.jpeg", width = 1000, height = 1000)
#ordiplot(proc.nmds.euclidean.autotransform, choices = c(1,2), type = "text", display = "sites") #Plot refined NMDS displaying only samples with their names
#dev.off()

#### CALCULATE DISTANCES BETWEEN TECHNICAL REPLICATE ORDINATIONS ####

NMDSCoordinates <- proc.nmds.euclidean.log$points #Save NMDS coordinates of each point in a new dataframe for the log transformed NMDS
head(NMDSCoordinates) #Confirm dataframe creation
nSamples <- length(NMDSCoordinates)/2 #Calculate the number of samples
sampleDistances <- vector(length = nSamples) #Create an empty vector to store distance values
for(i in 1:nSamples) { #For rows in NMDSCoordinates
  sampleDistances[i] <- sqrt((NMDSCoordinates[i,1]-NMDSCoordinates[i,2])^2 + (NMDSCoordinates[i+1,1]-NMDSCoordinates[i+1,2])^2) #Calculate distance between ordinations
  print(sampleDistances[i]) #Print the distance value
}
sampleDistances #Confirm vector creation. This vector has all consecutive pairs, including those that are not paris of technical replicates. I need to retain just the odd numbered rows.
technicalReplicates <- rownames(NMDSCoordinates) #Save rownames as a new vector
technicalReplicates #Confirm vector creation
technicalReplicateDistances <- data.frame(Sample = technicalReplicates[seq(from = 1, to = nSamples, by = 2)], 
                                          Distance = sampleDistances[seq(from = 1, to = nSamples, by = 2)]) #Create a new dataframe with just odd numbered row distances (technical replicate pairs)
head(technicalReplicateDistances) #Confirm dataframe creation
tail(technicalReplicateDistances) #Confirm dataframe creation

#### PLOT DISTANCES BETWEEN TECHNICAL REPLICATE ORDINATIONS ####

#jpeg(filename = "2017-10-10-Troubleshooting/2017-10-24-PRTC-NMDS/2017-10-24-NMDS-TechnicalReplication-Ordination-Distances-Normalized-LogTransformed.jpeg", width = 1000, height = 1000)
plot(x = technicalReplicateDistances$Sample, y = technicalReplicateDistances$Distance, type = "line", xlab = "Sample", ylab = "Distance between Ordinations")
#dev.off()