#In this script, I'll test the methylKit pipeline with 2 files Steven generated. This will allow me to figure out why I cannot unite my files. If I can unite Steven's files, then I know there is an issue with the files I generated. If I cannot unite Steven's files, then it could be a software issue.

#### SET WORKING DIRECTORY ####
getwd()
setwd("2018-05-09-Steven-Test-Files/") #Set working directory as test file folder

#### INSTALL PACKAGES ####
install.packages("devtools") #Install the devtools package
library(devtools) #Load devtools

source("https://bioconductor.org/biocLite.R") #Source package from bioconductor
biocLite("methylKit") #Install methylkit
install_github("al2na/methylKit", build_vignettes = FALSE, repos=BiocInstaller::biocinstallRepos(), dependencies = TRUE) #Install more methylKit options
library(methylKit) #Load methylkit

#### PROCESS METHYLATION DATA ####

analysisFiles <- list ("zr2096_1_dedup.sorted.bam",
                       "zr2096_10_dedup.sorted.bam") #Put all .bam files into a list for analysis
sample.IDs <- list("1", "10") #Create list of sample IDs
treatmentSpecification <- c(0, 1)
processedFiles <- processBismarkAln(location = analysisFiles, sample.id = sample.IDs, assembly = "v3", read.context = "CpG", mincov = 1, treatment = treatmentSpecification) #Process files for CpG meetehylation. First file is from ambient conditions, second from treatment.

#### ANALYZE METHYLATION DATA ####

nFiles <- length(sample.IDs) #Count number of samples
fileName <- data.frame("nameBase" = rep("2018-05-08-Percent-CpG-Methylation", times = nFiles),
                       "nameBase2" = rep("2018-05-08-Percent-CpG-Coverage", times = nFiles),
                       "sample.ID" = 1:10) #Create new dataframe for filenames
head(fileName) #Confirm dataframe creation
fileName$actualFileName <- paste(fileName$nameBase, "-Sample", fileName$sample.ID, ".jpeg", sep = "") #Create a new column for the full filename
fileName$actualFileName2 <- paste(fileName$nameBase2, "-Sample", fileName$sample.ID, ".jpeg", sep = "") #Create a new column for the full filename
head(fileName) #Confirm column creation

for(i in 1:nFiles) { #For each data file
  jpeg(filename = fileName$actualFileName[i], height = 1000, width = 1000) #Save file with designated name
  getMethylationStats(processedFiles[[i]], plot = TRUE, both.strands = FALSE) #Get %CpG methylation information
  dev.off() #Turn off plotting device
} #Plot and save %CpG methylation information

for(i in 1:nFiles) { #For each data file
  jpeg(filename = fileName$actualFileName2[i], height = 1000, width = 1000) #Save file with designated name
  getCoverageStats(processedFiles[[i]], plot = TRUE, both.strands = FALSE) #Get CpG coverage information
  dev.off() #Turn off plotting device
} #Plot and save CpG coverage information

methylationInformation <- unite(processedFiles) #Combine all processed files into a single table

getCorrelation(methylationInformation, plot = TRUE) #Understand correlation between methylation patterns in different samples
clusterSamples(methylationInformation, dist = "correlation", method = "ward", plot = TRUE) #Cluster samples based on correlation coefficients
clusteringInformation <- clusterSamples(methylationInformation, dist = "correlation", method = "ward", plot = FALSE) #Save cluster information as a new object

PCASamples(methylationInformation) #Run a PCA analysis on percent methylation for all samples
PCASamples(methylationInformation, screeplot = TRUE) #Run the PCA analysis and plot variances against PC number in a screeplot

differentialMethylationStats <- calculateDiffMeth(methylationInformation) #Calculate differential methylation statistics based on treatment indication from processBismarkAln
diffMethStats25 <- getMethylDiff(differentialMethylationStats, difference = 25, qvalue = 0.01) #Identify loci that are at least 25% different. Q-value is the FDR used for p-value corrections.
diffMethStats50 <- getMethylDiff(differentialMethylationStats, difference=50, qvalue=0.01) #Identify loci that are at least 50% different

#I couldn't get any methylation statistics because I only had 2 files, but I was able to run all of the commands! There are two explanations for why I couldn't get my samples to run through this script. One, I didn't use any trimmed data, while Steven did. Two, my subset data used the first 10,000 reads, while Steven's used the first 1,000,000.