--- title: "WGBS" author: "HM Putnam" date: "2/5/2022" output: html_document --- ```{r setup, include=FALSE} knitr::opts_chunk$set(echo = TRUE) ``` # Load Libraries ```{r, warning=F, message=F} #install.packages( c("data.table","plyr","reshape2","ggplot2","gridBase","devtools")) #if (!requireNamespace("BiocManager", quietly=TRUE)) # install.packages("BiocManager") #BiocManager::install(c("GenomicRanges","rtracklayer","impute","Rsamtools")) #install_github("BIMSBbioinfo/genomation",build_vignettes=FALSE) #BiocManager::install("genomation") # install the packages #library('devtools') library('methylKit') library('genomation') library('GenomicRanges') library('tidyverse') library('plotrix') library('viridis') library('ggthemes') library("pheatmap") ``` #obtain genome and annotation files from NCBI ```{bash} wget https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/002/022/765/GCF_002022765.2_C_virginica-3.0/GCF_002022765.2_C_virginica-3.0_genomic.fna.gz > RAnalysis/data/GCF_002022765.2_C_virginica-3.0_genomic.fna.gz gunzip RAnalysis/data/GCF_002022765.2_C_virginica-3.0_genomic.fna.gz wget https://raw.githubusercontent.com/epigeneticstoocean/2018_L18-adult-methylation/main/genome-features/C_virginica-3.0_Gnomon_genes.bed > RAnalysis/data/C_virginica-3.0_Gnomon_genes.bed ``` ## Loading sample, genomic, and annotation information ```{r, message=F} #load sample information sample.info <- read.csv("RAnalysis/data/adult-meta.csv", header=T, sep=",", na.string="NA", stringsAsFactors = F) #read in info sample.info <- sample.info %>% filter(Sex=="F") #load genes gff (https://raw.githubusercontent.com/epigeneticstoocean/2018_L18-adult-methylation/main/genome-features/C_virginica-3.0_Gnomon_genes.bed) Genes <- read.csv("RAnalysis/data/C_virginica-3.0_Gnomon_genes.bed", header=F, sep="\t", na.string="NA", skip=0, stringsAsFactors = F) #read in data file Genes <- Genes[,c(1:4)] #select desired columns only colnames(Genes) <-c("scaffold", "start", "stop", "gene") #rename columns ``` genrate a file for Xcov of all positions found in all samples (e.g., 10X) ```{bash} multiIntersectBed -i \ RAnalysis/data/16F_R1_val_1_10x.tab \ RAnalysis/data/19F_R1_val_1_10x.tab \ RAnalysis/data/22F_R1_val_1_10x.tab \ RAnalysis/data/29F_R1_val_1_10x.tab \ RAnalysis/data/35F_R1_val_1_10x.tab \ RAnalysis/data/36F_R1_val_1_10x.tab \ RAnalysis/data/39F_R1_val_1_10x.tab \ RAnalysis/data/3F_R1_val_1_10x.tab \ RAnalysis/data/41F_R1_val_1_10x.tab \ RAnalysis/data/44F_R1_val_1_10x.tab \ RAnalysis/data/50F_R1_val_1_10x.tab \ RAnalysis/data/52F_R1_val_1_10x.tab \ RAnalysis/data/53F_R1_val_1_10x.tab \ RAnalysis/data/54F_R1_val_1_10x.tab \ RAnalysis/data/76F_R1_val_1_10x.tab \ RAnalysis/data/77F_R1_val_1_10x.tab > RAnalysis/data/meth.10x.bed ``` ```{bash} cat RAnalysis/data/meth.10x.bed | awk '$4 ==16' > RAnalysis/data/filtered.AllSamps.10x.bed ``` ```{bash} #Use intersectBed to find where loci and genes intersect, allowing loci to be mapped to annotated genes #wb: Print all lines in the second file #a: file that ends in posOnly #b: annotated gene list #Save output in a new file that has the same base name and ends in -Annotated.txt for f in RAnalysis/data/*_R1_val_1_10x.tab do intersectBed \ -wb \ -a ${f} \ -b RAnalysis/data/C_virginica-3.0_Gnomon_genes.bed \ > ${f}_gene done ``` ```{bash} #intersect with file to subset only those positions found in all samples for f in RAnalysis/data/*_gene do intersectBed \ -a ${f} \ -b RAnalysis/data/filtered.AllSamps.10x.bed \ > ${f}_AllSamps_gene.bed done wc -l RAnalysis/data/*AllSamps_gene.bed ``` ```{r} meth.data <- list.files(path = "RAnalysis/data/", pattern = "AllSamps_gene.bed$", full.names=TRUE) %>% set_names(.) %>% map_dfr(read.csv,.id="Sample.ID", header=FALSE, sep="\t", na.string="NA", stringsAsFactors = FALSE) %>% dplyr::select(c(Sample.ID:V2, V4:V6,V10)) %>% group_by(Sample.ID) colnames(meth.data) <- c("Sample.ID", "scaffold", "position","per.meth","meth","unmeth","gene") meth.data$Sample.ID <- gsub("RAnalysis/data//","",meth.data$Sample.ID) #remove extra characters meth.data$Sample.ID <- gsub("_R1_val_1_10x.tab_gene_AllSamps_gene.bed","",meth.data$Sample.ID) #remove extra characters meth.data.x <- merge(meth.data, sample.info, by="Sample.ID") #plotting allgene <- meth.data.x %>% group_by(gene,Sample.ID, Phenotype, Month, Coral.ID, Pair) %>% summarise(mean = mean(per.meth), sem = std.error(per.meth)) %>% mutate(Month = factor(Month, levels=c("July", "December"))) %>% mutate(Coral.ID = factor(Coral.ID, levels=c("M-11", "M-19","M-201", "M-203", "M-209", "M-211", "M-217", "M-219", "M-221", "M-3", "M-12", "M-20", "M-202", "M-204", "M-210", "M-212", "M-218", "M-220", "M-222", "M-4"))) ggplot(allgene, aes(x=Month, y=Phenotype, fill=mean))+ geom_tile(color="white", size=0.01)+ scale_fill_viridis(direction = 1,name="Percent Methylation")+ #facet_wrap(~Pair, ncol=2)+ theme_tufte(base_family="Helvetica") ggplot(allgene, aes(x=Month, y=Coral.ID, fill=mean))+ geom_tile(color="white", size=0.01)+ scale_fill_viridis(direction = 1,name="Percent Methylation")+ facet_wrap(~Phenotype, ncol=2)+ theme_tufte(base_family="Helvetica") #Plotting reaction norm meangenes <- meth.data.x %>% group_by(,Phenotype, Month) %>% summarise(mean = mean(per.meth), sem = std.error(per.meth)) %>% mutate(Month = factor(Month, levels=c("July", "December"))) meangenes %>% ggplot(aes(x=Month, y=mean, group=Phenotype, colour=Phenotype, shape=Phenotype)) + #plot data geom_line(size=0.7, position=position_dodge(.1)) + #plot lines scale_colour_manual(values = c("grey", "black")) + #set line color geom_point(size=4, position=position_dodge(.1), colour="black") + #set point characteristics scale_shape_manual(values=c(1,16)) + #set shapes geom_errorbar(aes(ymin=mean-sem, ymax=mean+sem), #plot error bars width=0, position=position_dodge(.1), colour="black") + #set error bar characteristics ylim(12.5,14.5)+ xlab("Secondary Treatment") + #plot x axis label ylab("Percent Methylation") + #plot y axis label theme_bw() ``` Testing for DMGs ```{r} # Binomial GLM to test for differentially methylated genes sub_meth_table <- meth.data.x sub_meth_table$grouped.gene <- paste0(sub_meth_table$Sample.ID, sub_meth_table$gene) sub_meth_table$factors <- paste0(sub_meth_table$Phenotype, sub_meth_table$Month) #filter for genes with >5 methylated positions min.filt <- count(sub_meth_table, vars = c(grouped.gene)) newdata <- min.filt[ which(min.filt$n > 4), ] sub_meth_table <- sub_meth_table[sub_meth_table$grouped.gene %in% newdata$vars,] # create data frame to store results results <- data.frame() gs <- unique(sub_meth_table$gene) #first subset the unique dataframes and second run the GLMs for(i in 1:length(sub_meth_table$gene)){ #subset the dataframe gene by gene sub_meth_table1 <- subset(sub_meth_table, gene ==gs[i]) # fit glm position model fit <- glm(matrix(c(meth, unmeth), ncol=2) ~ Phenotype * Month, data=sub_meth_table1, family=binomial) a <- anova(fit, test="LRT") # capture summary stats to data frame df <- data.frame(gene = sub_meth_table1[1,8], pval.treatment1 = a$`Pr(>Chi)`[2], pval.treatment2 = a$`Pr(>Chi)`[3], #position = a$`Pr(>Chi)`[2], pval.treatment1_x_treatment2 = a$`Pr(>Chi)`[4], #pval.treatment1_x_position = a$`Pr(>Chi)`[2], #pval.treatment2_x_position = a$`Pr(>Chi)`[2], #pval.treatment1_x_pval.treatment2_x_position = a$`Pr(>Chi)`[2], stringsAsFactors = F) # bind rows of temporary data frame to the results data frame results <- rbind(results, df) } # An error will be generated here for contrasts. #This potential for contrasts (interactions) is included in the case one wants to examine the role of position of CpG within a gene #Error in `contrasts<-`(`*tmp*`, value = contr.funs[1 + isOF[nn]]) : contrasts can be applied only to factors with 2 or more levels #Continuing the analysis from results line will generate the results in the absence of the contrast (interaction). results[is.na(results)] <- 0 results$adj.pval.treatment1 <- p.adjust(results$pval.treatment1, method='BH') results$adj.pval.treatment2 <- p.adjust(results$pval.treatment2, method='BH') results$adj.pval.treatment1_x_treatment2 <- p.adjust(results$pval.treatment1_x_treatment2, method='BH') #results$adj.pval.treatment1_x_position <- p.adjust(results$pval.treatment1_x_position, method='BH') #results$adj.pval.treatment2_x_position <- p.adjust(results$pval.treatment2_x_position, method='BH') #results$adj.pval.treatment1_x_pval.treatment2_x_position <- p.adjust(results$pval.treatment1_x_pval.treatment2_x_position, method='BH') sig <-results sig <- sig[,c(1,5:7)] # Identifying DMG with significant interaction sig.phenotype <- sig %>% filter(adj.pval.treatment1<0.01) sig.phenotype <- sig.phenotype[order(sig.phenotype$adj.pval.treatment1),] sig.phenotype <- sig.phenotype[1:100,1:2] sig.month <- sig %>% filter(adj.pval.treatment2<0.01) sig.month <- sig.month[order(sig.month$adj.pval.treatment2),] sig.month <- sig.month[1:100,1:2] sig.int <- sig %>% filter(adj.pval.treatment1_x_treatment2<0.01) sig.int <- sig.int[order(sig.int$adj.pval.treatment1_x_treatment2),] sig.int <- sig.int[1:100,1:2] # # Annotation of Secondary Exposure Interaction DMG # D145.sig.int.annot <- merge(D145.sig.int, Annot, by="gene", all.x=TRUE) # D145.sig.int.annot <- D145.sig.int.annot[order(D145.sig.int.annot$adj.pval.treatment1_x_treatment2),] # D145.sig.int.annot <- D145.sig.int.annot[!duplicated(D145.sig.int.annot$gene),] # #Sanity check plotting of top DMGs with sig interaction #Plotting reaction norm sub_meth_table %>% filter(gene== sig.phenotype$gene[1]) %>% group_by(gene,Phenotype, Month) %>% summarise(means = mean(per.meth), ses = std.error(per.meth)) %>% mutate(Month = factor(Month, levels=c("July", "December"))) %>% ggplot( aes(x=Month, y=means, group=Phenotype, colour=Phenotype, shape=Phenotype)) + #plot data geom_line(size=0.7, position=position_dodge(.1)) + #plot lines scale_colour_manual(values = c("gray", "black")) + #set line color geom_point(size=4, position=position_dodge(.1), colour="black") + #set point characteristics scale_shape_manual(values=c(1,18,16)) + #set shapes geom_errorbar(aes(ymin=means-ses, ymax=means+ses), #plot error bars width=0, position=position_dodge(.1), colour="black") + #set error bar characteristics xlab("Month") + #plot x axis label ylab("Percent Methylation") + #plot y axis label theme_bw() ``` #DEG Heatmap ```{r} #Month DMGs <- allgene[allgene$gene %in% sig.month$gene,] DMGs <- DMGs[,-8] Data<- DMGs %>% spread(key="gene", value=mean) Data.long <-Data row.names(Data.long) <- Data$Sample.ID Data.long <- Data.long[,-c(1:5)] Data.long <- data.matrix(Data.long, rownames.force=F) Data.long <-t(Data.long) Data.long <- as.matrix(Data.long) colnames(Data.long) <- Data$Sample.ID col_order <- c("37","38","40","41","44","45","47","51","52","57","39","42","43","46","50","54","55","56","58","59","16","22","23","28","29","2","31","32","35","4","17","18","21","24","25","26","30","33","34","6") Data.long <- Data.long[, col_order] Data.long df<- as.data.frame(Data[,2:3]) df$Month <- as.factor(as.character(df$Month)) #df$Coral.ID <- as.factor(as.character(df$Coral.ID)) rownames(df) <- Data$Sample.ID ann_colors = list( Phenotype = c(Bleached="cornsilk1", NonBleached="coral4"), Month = c(July = "darkorange", December = "cyan")) pdf("output/DMG_month.pdf") pheatmap(Data.long, color=inferno(10), cluster_rows=T, show_rownames=FALSE, show_colnames=TRUE, fontsize_row=8, scale="row", cluster_cols=F, annotation_col=df, annotation_colors=ann_colors, gaps_col = c(20)) dev.off() #Phenotype DMGs <- allgene[allgene$gene %in% sig.phenotype$gene,] DMGs <- DMGs[,-8] Data<- DMGs %>% spread(key="gene", value=mean) Data.long <-Data row.names(Data.long) <- Data$Sample.ID Data.long <- Data.long[,-c(1:5)] Data.long <- data.matrix(Data.long, rownames.force=F) Data.long <-t(Data.long) Data.long <- as.matrix(Data.long) colnames(Data.long) <- Data$Sample.ID col_order <- c("37","38","40","41","44","45","47","51","52","57","16","22", "23","28","29","2","31","32","35","4","39","42","43","46","50", "54","55","56","58","59","17","18","21","24","25","26","30","33","34","6") Data.long <- Data.long[, col_order] Data.long df<- as.data.frame(Data[,2:3]) df$Month <- as.factor(as.character(df$Month)) #df$Coral.ID <- as.factor(as.character(df$Coral.ID)) rownames(df) <- Data$Sample.ID ann_colors = list( Phenotype = c(Bleached="cornsilk1", NonBleached="coral4"), Month = c(July = "darkorange", December = "cyan")) pdf("output/DMG_Phenotype.pdf") pheatmap(Data.long, color=inferno(10), cluster_rows=T, show_rownames=FALSE, show_colnames=TRUE, fontsize_row=8, scale="row", cluster_cols=F, annotation_col=df, annotation_colors=ann_colors, gaps_col = c(20)) dev.off() #Interaction DMGs <- allgene[allgene$gene %in% sig.int$gene,] DMGs <- DMGs[,-8] Data<- DMGs %>% spread(key="gene", value=mean) Data.long <-Data row.names(Data.long) <- Data$Sample.ID Data.long <- Data.long[,-c(1:5)] Data.long <- data.matrix(Data.long, rownames.force=F) Data.long <-t(Data.long) Data.long <- as.matrix(Data.long) colnames(Data.long) <- Data$Sample.ID Data.long <- Data.long[, col_order] Data.long df<- as.data.frame(Data[,2:3]) df$Month <- as.factor(as.character(df$Month)) #df$Coral.ID <- as.factor(as.character(df$Coral.ID)) rownames(df) <- Data$Sample.ID ann_colors = list( Phenotype = c(Bleached="cornsilk1", NonBleached="coral4"), Month = c(July = "darkorange", December = "cyan")) pdf("output/DMG_Interaction.pdf") pheatmap(Data.long, color=inferno(10), cluster_rows=T, show_rownames=FALSE, show_colnames=TRUE, fontsize_row=8, scale="row", cluster_cols=F, annotation_col=df, annotation_colors=ann_colors, gaps_col = c(10,20,30)) dev.off() ``` METHYL KIT APPROACH #read in data cov files ```{r} data.list <- list("16_S138_10x_sorted_AllSamps_genes.tab", "17_S134_10x_sorted_AllSamps_genes.tab", "18_S154_10x_sorted_AllSamps_genes.tab", "2_S128_10x_sorted_AllSamps_genes.tab" , "21_S119_10x_sorted_AllSamps_genes.tab", "22_S120_10x_sorted_AllSamps_genes.tab", "23_S141_10x_sorted_AllSamps_genes.tab", "24_S147_10x_sorted_AllSamps_genes.tab", "25_S148_10x_sorted_AllSamps_genes.tab", "26_S121_10x_sorted_AllSamps_genes.tab", "28_S122_10x_sorted_AllSamps_genes.tab", "29_S153_10x_sorted_AllSamps_genes.tab", "30_S124_10x_sorted_AllSamps_genes.tab", "31_S127_10x_sorted_AllSamps_genes.tab", "32_S130_10x_sorted_AllSamps_genes.tab", "33_S142_10x_sorted_AllSamps_genes.tab", "34_S136_10x_sorted_AllSamps_genes.tab", "35_S137_10x_sorted_AllSamps_genes.tab", "37_S140_10x_sorted_AllSamps_genes.tab", "38_S129_10x_sorted_AllSamps_genes.tab", "39_S145_10x_sorted_AllSamps_genes.tab", "4_S146_10x_sorted_AllSamps_genes.tab", "40_S135_10x_sorted_AllSamps_genes.tab", "41_S151_10x_sorted_AllSamps_genes.tab", "42_S131_10x_sorted_AllSamps_genes.tab", "43_S143_10x_sorted_AllSamps_genes.tab", "44_S125_10x_sorted_AllSamps_genes.tab", "45_S156_10x_sorted_AllSamps_genes.tab", "46_S133_10x_sorted_AllSamps_genes.tab", "47_S155_10x_sorted_AllSamps_genes.tab", "50_S139_10x_sorted_AllSamps_genes.tab", "51_S126_10x_sorted_AllSamps_genes.tab", "52_S150_10x_sorted_AllSamps_genes.tab", "54_S144_10x_sorted_AllSamps_genes.tab", "55_S123_10x_sorted_AllSamps_genes.tab", "56_S149_10x_sorted_AllSamps_genes.tab", "57_S152_10x_sorted_AllSamps_genes.tab", "58_S157_10x_sorted_AllSamps_genes.tab", "59_S158_10x_sorted_AllSamps_genes.tab", "6_S132_10x_sorted_AllSamps_genes.tab") #The following command reads in coverage files data <-methRead(data.list, sample.id = list(), assembly = "Mcap_v2", treatment = ), context = "CpG", pipeline = "bismarkCoverage", mincov=10) ``` #corr plots ```{r} #Identify CG covered across all samples and generate a correlation plot #unite# meth_all<-methylKit::unite(data) nrow(data) #correlation plot# pdf("output/MethCompare_correlation_test.pdf", width = 1000, height = 1000) getCorrelation(meth_all,plot=TRUE) dev.off() ``` ```{r} clusterSamples(meth_all, dist="correlation", method="ward.D", plot=TRUE) PCASamples(meth_all) ```