--- title: "Expression_Methylation" author: "HM Putnam" date: "6/12/2020" output: html_document --- ```{r setup, include=FALSE} knitr::opts_chunk$set(echo = FALSE) #source("https://bioconductor.org/biocLite.R") library("pheatmap") library("RColorBrewer") library("tidyverse") library("GSEABase") library("cowplot") library("lattice") library("latticeExtra") library("ggforce") library("goseq") ``` Goal is to identify and compare presence and absence and the expression levels of the genes captured by each method Load sample info, annotation, and counts ```{r load data} #load sample info sample.info <- read.csv(file="metadata/RNASeq_metadata.csv", header=T, sep=",", row.names=1) #separate sample info by species Mcap.sample.info <- subset(sample.info, Species == "Montipora capitata") #Mcap = Montipora capitata Pact.sample.info <- subset(sample.info, Species == "Pocillopora acuta") #Pact = Pocillopora acuta #Load Mcap Annotation Info Mcap.annot <- read.csv(file="genome-feature-files/Mcap-GO-KO-Kegg.tab", header=FALSE, sep="\t") colnames(Mcap.annot) <- c("Uniprot", "gene", "eval", "Prot.ID", "Rev", "Prot.Name.Long", "Prot.Name.Short", "Taxa", "Num", "GO1", "GO2", "GO3", "GO4", "GO.IDs","KEGG", "KEGG.Path") Mcap.annot <- Mcap.annot %>% distinct(gene, .keep_all = TRUE) #keep only distinct gene ids Mcap.annot$gene <- gsub("augustus.", "", Mcap.annot$gene) #remove excess characters from gene name Mcap.annot$gene <- gsub(".t1", "", Mcap.annot$gene) #remove excess characters from gene name #Identify Mcap full set of GO terms Mcap.GO <- Mcap.annot[,c(2,14)] #identify GOids splitted <- strsplit(as.character(Mcap.GO$GO.IDs), "; ") #slit into multiple GO ids Mcap.GOs <- data.frame(v1 = rep.int(Mcap.GO$gene, sapply(splitted, length)), v2 = unlist(splitted)) #list all GOs with each gene Mcap.GO.terms <- Mcap.GOs %>% distinct(v2, .keep_all = TRUE) #keep only distinct GO terms length(unique(Mcap.GO.terms$v2)) #view number of go terms #Load Pact Annotation Info Pact.annot <- read.csv(file="genome-feature-files/Pact-GO-KO-Kegg.tab", header=FALSE, sep="\t") colnames(Pact.annot) <- c("Uniprot", "gene", "eval", "Prot.ID", "Rev", "Prot.Name.Long", "Prot.Name.Short", "Taxa", "Num", "GO1", "GO2", "GO3", "GO4", "GO.IDs","KEGG", "KEGG.Path") Pact.annot <- Pact.annot %>% distinct(gene, .keep_all = TRUE) #keep only distinct gene ids #Identify Mcap full set of GO terms Pact.GO <- Pact.annot[,c(2,14)] #identify GOids splitted <- strsplit(as.character(Pact.GO$GO.IDs), "; ") #slit into multiple GO ids Pact.GOs <- data.frame(v1 = rep.int(Pact.GO$gene, sapply(splitted, length)), v2 = unlist(splitted)) #list all GOs with each gene Pact.GO.terms <- Pact.GOs %>% distinct(v2, .keep_all = TRUE) #keep only distinct GO terms length(unique(Pact.GO.terms$v2)) #view number of go terms #Load Mcap gene gff Mcap.genes <- read.csv(file="genome-feature-files/Mcap.GFFannotation.gff", header=FALSE, sep="\t") Mcap.genes <- filter(Mcap.genes, V3 == "gene") #load sample info colnames(Mcap.genes) <- c("scaffold", "AUGUSTUS", "id", "gene.start","gene.stop", "pos1", "pos2","pos3", "gene") Mcap.genes$gene <- gsub("ID=", "", Mcap.genes$gene) Mcap.genes$gene <- gsub(";.*", "", Mcap.genes$gene) Mcap.gene.len.df <- Mcap.genes Mcap.gene.len.df$length <- Mcap.gene.len.df$gene.stop-Mcap.gene.len.df$gene.start #write output of gene gff only write.table(Mcap.genes, file="genome-feature-files/Mcap.GFFannotation.genes.gff", sep="\t", col.names = FALSE, row.names=FALSE, quote=FALSE) Mcap.genes <- Mcap.genes[,c(1,4,5,9)] #Load Pact gene gff Pact.genes <- read.csv(file="genome-feature-files/Pact_Structural_annotation_abintio.gff", header=FALSE, sep="\t", skip=22) #load sample info Pact.genes <- filter(Pact.genes, V3 == "gene") Pact.genes$V1 <- gsub("cov", "_cov", Pact.genes$V1) write.table(Pact.genes, file="genome-feature-files/Pact.GFFannotation.genes.gff", sep="\t", col.names = FALSE, row.names=FALSE, quote=FALSE) Pact.genes <- Pact.genes[,c(1,4,5,9)] colnames(Pact.genes) <- c("scaffold", "gene.start","gene.stop", "gene") Pact.gene.len.df <- Pact.genes Pact.gene.len.df$length <- Pact.gene.len.df$gene.stop-Pact.gene.len.df$gene.start #load expression data #Montipora capitata counts Mcap.1101 <- read.csv(file="RNASeq/PolyA/1101_gene_abund.tab", header=TRUE, sep="\t") #Load expression matrix colnames(Mcap.1101)[c(8,9)] <- c("Mcap.1101.FPKM", "Mcap.1101.TPM") Mcap.1548 <- read.csv(file="RNASeq/PolyA/1548_gene_abund.tab", header=TRUE, sep="\t") #Load expression matrix colnames(Mcap.1548)[c(8,9)] <- c("Mcap.1548.FPKM", "Mcap.1548.TPM") Mcap.1628 <- read.csv(file="RNASeq/PolyA/1628_gene_abund.tab", header=TRUE, sep="\t") #Load expression matrix colnames(Mcap.1628)[c(8,9)] <- c("Mcap.1628.FPKM", "Mcap.1628.TPM") Mcap.counts <- left_join(Mcap.1101,Mcap.1548, by="Gene.ID" ) Mcap.counts <- left_join(Mcap.counts,Mcap.1628, by="Gene.ID" ) Mcap.counts <- Mcap.counts[,c(1,3,5,6,8,9,16,17, 24,25)] Mcap.counts$mean.FPKM <- rowMeans(Mcap.counts[,c(5,7,9)]) Mcap.counts$mean.TPM <- rowMeans(Mcap.counts[,c(6,8,10)]) colnames(Mcap.counts) <- c("gene", "scaffold", "start", "stop", "Mcap.1101.FPKM", "Mcap.1101.TPM", "Mcap.1548.FPKM", "Mcap.1548.TPM", "Mcap.1628.FPKM", "Mcap.1628.TPM", "mean.FPKM", "mean.TPM" ) range(Mcap.counts$mean.FPKM) mean(Mcap.counts$mean.FPKM) range(Mcap.counts$mean.TPM) mean(Mcap.counts$mean.TPM) #Pocillopora acuta counts Pact.1041 <- read.csv(file="RNASeq/PolyA/1041_gene_abund.tab", header=TRUE, sep="\t") #Load expression matrix colnames(Pact.1041)[c(8,9)] <- c("Pact.1041.FPKM", "Pact.1041.TPM") Pact.1471 <- read.csv(file="RNASeq/PolyA/1471_gene_abund.tab", header=TRUE, sep="\t") #Load expression matrix colnames(Pact.1471)[c(8,9)] <- c("Pact.1471.FPKM", "Pact.1471.TPM") Pact.1637 <- read.csv(file="RNASeq/PolyA/1637_gene_abund.tab", header=TRUE, sep="\t") #Load expression matrix colnames(Pact.1637)[c(8,9)] <- c("Pact.1637.FPKM", "Pact.1637.TPM") Pact.counts <- left_join(Pact.1041,Pact.1471, by="Gene.ID" ) Pact.counts <- left_join(Pact.counts,Pact.1637, by="Gene.ID" ) Pact.counts <- Pact.counts[,c(1,3,5,6,8,9,16,17, 24,25)] Pact.counts$mean.FPKM <- rowMeans(Pact.counts[,c(5,7,9)]) Pact.counts$mean.TPM <- rowMeans(Pact.counts[,c(6,8,10)]) colnames(Pact.counts) <- c("gene", "scaffold", "start", "stop", "Pact.1041.FPKM", "Pact.1041.TPM", "Pact.1548.FPKM", "Pact.1548.TPM", "Pact.1628.FPKM", "Pact.1628.TPM", "mean.FPKM", "mean.TPM" ) range(Pact.counts$mean.FPKM) mean(Pact.counts$mean.FPKM) range(Pact.counts$mean.TPM) mean(Pact.counts$mean.TPM) ``` Obtain the genes from both genomes that have CpG data with 5x coverage ```{bash} #Intersect Mcap CpG with genes for f in data/Mcap_tab/*5x.tab do intersectBed \ -wb \ -a ${f} \ -b genome-feature-files/Mcap.GFFannotation.genes.gff \ > ${f}_gene done #Intersect Pact CpG with genes for f in data/Pact_tab/*5x.tab do intersectBed \ -wb \ -a ${f} \ -b genome-feature-files/Pact.GFFannotation.genes.gff \ > ${f}_gene done ``` Load M. capitata (Mcap) Methylation data and ID genes and GO terms in each method ```{r} #load methylation data with minimum of 5x coverage of CpGs that has intersected with genes #1101 1548 1628 WGBS.1101 <- read.csv("data/Mcap_tab/Meth10_R1_001_val_1_bismark_bt2_pe._5x.tab_gene", header=FALSE, sep="\t") colnames(WGBS.1101) <- c("scaffold", "start", "stop", "permeth","meth", "unmeth", "intersection", "AUGUSTUS", "id", "gene.start","gene.stop", "pos1", "pos2","pos3", "gene") WGBS.1548 <- read.csv("data/Mcap_tab/Meth11_R1_001_val_1_bismark_bt2_pe._5x.tab_gene", header=FALSE, sep="\t") colnames(WGBS.1548) <- c("scaffold", "start", "stop", "permeth","meth", "unmeth", "intersection", "AUGUSTUS", "id", "gene.start","gene.stop", "pos1", "pos2","pos3", "gene") WGBS.1628 <- read.csv("data/Mcap_tab/Meth12_R1_001_val_1_bismark_bt2_pe._5x.tab_gene", header=FALSE, sep="\t") colnames(WGBS.1628) <- c("scaffold", "start", "stop", "permeth","meth", "unmeth", "intersection", "AUGUSTUS", "id", "gene.start","gene.stop", "pos1", "pos2","pos3", "gene") #Determine GO terms for the genes found with each method # WGBS samples WGBS.data <- rbind(WGBS.1101,WGBS.1548,WGBS.1628) #combine samples into df by method WGBS.genes <- as.data.frame(unique(WGBS.data$gene)) #keep unique genes only colnames(WGBS.genes) <- c("gene") #name gene column WGBS.gene.annot <- left_join(WGBS.genes, Mcap.annot, by="gene") #combine genes and annotation nrow(WGBS.genes) WGBS.GO <- WGBS.gene.annot[,c(1,14)] #identify GOids splitted <- strsplit(as.character(WGBS.GO$GO.IDs), "; ") #slit into multiple GO ids WGBS.GOs <- data.frame(v1 = rep.int(WGBS.GO$gene, sapply(splitted, length)), v2 = unlist(splitted)) #list all GOs with each gene WGBS.GO.terms <- WGBS.GOs %>% distinct(v2, .keep_all = TRUE) #keep only distinct GO terms length(unique(WGBS.GO.terms$v2)) #view number of go terms #RRBS samples RRBS.1101 <- read.csv("data/Mcap_tab/Meth13_R1_001_val_1_bismark_bt2_pe._5x.tab_gene", header=FALSE, sep="\t") colnames(RRBS.1101) <- c("scaffold", "start", "stop", "permeth","meth", "unmeth", "intersection", "AUGUSTUS", "id", "gene.start","gene.stop", "pos1", "pos2","pos3", "gene") RRBS.1548 <- read.csv("data/Mcap_tab/Meth14_R1_001_val_1_bismark_bt2_pe._5x.tab_gene", header=FALSE, sep="\t") colnames(RRBS.1548) <- c("scaffold", "start", "stop", "permeth","meth", "unmeth", "intersection", "AUGUSTUS", "id", "gene.start","gene.stop", "pos1", "pos2","pos3", "gene") RRBS.1628 <- read.csv("data/Mcap_tab/Meth15_R1_001_val_1_bismark_bt2_pe._5x.tab_gene", header=FALSE, sep="\t") colnames(RRBS.1628) <- c("scaffold", "start", "stop", "permeth","meth", "unmeth", "intersection", "AUGUSTUS", "id", "gene.start","gene.stop", "pos1", "pos2","pos3", "gene") RRBS.data <- rbind(RRBS.1101,RRBS.1548,RRBS.1628) #combine samples into df by method RRBS.genes <- as.data.frame(unique(RRBS.data$gene)) #keep unique genes only colnames(RRBS.genes) <- c("gene") #name gene column RRBS.gene.annot <- left_join(RRBS.genes, Mcap.annot, by="gene") #combine genes and annotation nrow(RRBS.genes) RRBS.GO <- RRBS.gene.annot[,c(1,14)] #identify GOids splitted <- strsplit(as.character(RRBS.GO$GO.IDs), "; ") #slit into multiple GO ids RRBS.GOs <- data.frame(v1 = rep.int(RRBS.GO$gene, sapply(splitted, length)), v2 = unlist(splitted)) #list all GOs with each gene RRBS.GO.terms <- RRBS.GOs %>% distinct(v2, .keep_all = TRUE) #keep only distinct GO terms length(unique(RRBS.GO.terms$v2)) #view number of go terms MBDBS.1101 <- read.csv("data/Mcap_tab/Meth16_R1_001_val_1_bismark_bt2_pe._5x.tab_gene", header=FALSE, sep="\t") colnames(MBDBS.1101) <- c("scaffold", "start", "stop", "permeth","meth", "unmeth", "intersection", "AUGUSTUS", "id", "gene.start","gene.stop", "pos1", "pos2","pos3", "gene") MBDBS.1548 <- read.csv("data/Mcap_tab/Meth17_R1_001_val_1_bismark_bt2_pe._5x.tab_gene", header=FALSE, sep="\t") colnames(MBDBS.1548) <- c("scaffold", "start", "stop", "permeth","meth", "unmeth", "intersection", "AUGUSTUS", "id", "gene.start","gene.stop", "pos1", "pos2","pos3", "gene") MBDBS.1628 <- read.csv("data/Mcap_tab/Meth18_R1_001_val_1_bismark_bt2_pe._5x.tab_gene", header=FALSE, sep="\t") colnames(MBDBS.1628) <- c("scaffold", "start", "stop", "permeth","meth", "unmeth", "intersection", "AUGUSTUS", "id", "gene.start","gene.stop", "pos1", "pos2","pos3", "gene") MBDBS.data <- rbind(MBDBS.1101,MBDBS.1548,MBDBS.1628) #combine samples into df by method MBDBS.genes <- as.data.frame(unique(MBDBS.data$gene)) #keep unique genes only colnames(MBDBS.genes) <- c("gene") #name gene column MBDBS.gene.annot <- left_join(MBDBS.genes, Mcap.annot, by="gene") #combine genes and annotation nrow(MBDBS.genes) MBDBS.GO <- MBDBS.gene.annot[,c(1,14)] #identify GOids splitted <- strsplit(as.character(MBDBS.GO$GO.IDs), "; ") #slit into multiple GO ids MBDBS.GOs <- data.frame(v1 = rep.int(MBDBS.GO$gene, sapply(splitted, length)), v2 = unlist(splitted)) #list all GOs with each gene MBDBS.GO.terms <- MBDBS.GOs %>% distinct(v2, .keep_all = TRUE) #keep only distinct GO terms length(unique(MBDBS.GO.terms$v2)) #view number of go terms #combine genes indentified by each method into dataframe for plotting WGBS.genes.circ <- as.data.frame(WGBS.genes) WGBS.genes.circ$method <- "WGBS" colnames(WGBS.genes.circ) <- c("gene", "method") RRBS.genes.circ <- as.data.frame(RRBS.genes) RRBS.genes.circ$method <- "RRBS" colnames(RRBS.genes.circ) <- c("gene", "method") MBDBS.genes.circ <- as.data.frame(MBDBS.genes) MBDBS.genes.circ$method <- "MBDBS" colnames(MBDBS.genes.circ) <- c("gene", "method") #combine 3 methods together All.circ <- left_join(Mcap.genes, WGBS.genes.circ, by="gene", keep=TRUE) colnames(All.circ)[4] <- "gene" All.circ <- left_join(All.circ, RRBS.genes.circ, by="gene", keep=TRUE) colnames(All.circ)[4] <- "gene" All.circ <- left_join(All.circ, MBDBS.genes.circ, by="gene", keep=TRUE) colnames(All.circ) <- c("scaffold","gene.start", "gene.stop", "gene","WGBS","method.WGBS", "RRBS", "method.RRBS", "MBDBS", "method.MBDBS" ) #order the dataframe by scaffold and gene start position All.circ.df <- arrange(All.circ,scaffold, gene.start) All.circ.data <- All.circ.df[,c(1:5,7,9)] str(All.circ.data) All.circ.data$WGBS <- as.character(as.factor(All.circ.data$WGBS)) All.circ.data$RRBS <- as.character(as.factor(All.circ.data$RRBS)) All.circ.data$MBDBS <- as.character(as.factor(All.circ.data$MBDBS)) str(All.circ.data) # Genes with NA are those not found in a dataset and assigned to 0 # Genes present are assigned a 1 All.circ.data[is.na(All.circ.data)] <- 0 #change NA to 0 All.circ.data$WGBS[All.circ.data$WGBS != 0] <- 1 #change gene to 1 All.circ.data$RRBS[All.circ.data$RRBS != 0] <- 1 #change gene to 1 All.circ.data$MBDBS[All.circ.data$MBDBS != 0] <- 1 #change gene to 1 str(All.circ.data) #change 0 and 1 to numeric All.circ.data$WGBS <- as.numeric(as.character(All.circ.data$WGBS)) All.circ.data$RRBS <- as.numeric(as.character(All.circ.data$RRBS)) All.circ.data$MBDBS <- as.numeric(as.character(All.circ.data$MBDBS)) str(All.circ.data) #genes present by method Mcap.CpG.gene.counts <- left_join(All.circ.data, Mcap.counts, by="gene") Mcap.CpG.gene.counts <- Mcap.CpG.gene.counts[,c(4,5,6,7,18)] Mcap.WGBS.counts <- Mcap.CpG.gene.counts[,c(1,2,5)] colnames(Mcap.WGBS.counts)[2] <- "type" Mcap.WGBS.counts$method <- "WGBS" Mcap.RRBS.counts <- Mcap.CpG.gene.counts[,c(1,3,5)] colnames(Mcap.RRBS.counts)[2] <- "type" Mcap.RRBS.counts$method <- "RRBS" Mcap.MBDBS.counts <- Mcap.CpG.gene.counts[,c(1,4,5)] colnames(Mcap.MBDBS.counts)[2] <- "type" Mcap.MBDBS.counts$method <- "MBDBS" Mcap.CpG.genes.all <- rbind(Mcap.WGBS.counts, Mcap.RRBS.counts, Mcap.MBDBS.counts) Mcap.CpG.genes.all$gene <- factor(Mcap.CpG.genes.all$gene, levels = All.circ.df$gene) pdf("Output/Mcap_PresAbs_genes.pdf") ggplot(Mcap.CpG.genes.all, aes(gene, method, fill=type, width = 1)) + geom_tile() + coord_fixed(ratio = 7000)+ scale_fill_gradientn(colours=c("black", "yellow"))+ ggtitle("A ")+ theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1,size =1), #Set the text angle axis.line = element_line(color = 'black'), #Set the axes color panel.border = element_blank(), #Set the border panel.grid.major = element_blank(), #Set the major gridlines panel.grid.minor = element_blank(), #Set the minor gridlines plot.background=element_blank()) #Set the plot background dev.off() ``` Gene expresion counts for genes with CpG identified by each method ```{r} Mcap.WGBS.counts$type[Mcap.WGBS.counts$type==0] <- NA Mcap.WGBS.counts <- na.omit(Mcap.WGBS.counts) Mcap.RRBS.counts$type[Mcap.RRBS.counts$type==0] <- NA Mcap.RRBS.counts <- na.omit(Mcap.RRBS.counts) Mcap.MBDBS.counts$type[Mcap.MBDBS.counts$type==0] <- NA Mcap.MBDBS.counts <- na.omit(Mcap.MBDBS.counts) Mcap.CpG.counts.present <- rbind(Mcap.WGBS.counts, Mcap.RRBS.counts, Mcap.MBDBS.counts) Mcap.CpG.counts.present$plusone <- log10(Mcap.CpG.counts.present$mean.TPM+1) #cat.cols <- c("#E6550D", "#756BB1", "#31A354") #WGBS, RRBS, MBDBS Mcap.WGBS.RelFreq <- filter(Mcap.CpG.counts.present, method =="WGBS") Mcap.RRBS.RelFreq <- filter(Mcap.CpG.counts.present, method =="RRBS") Mcap.MBDBS.RelFreq <- filter(Mcap.CpG.counts.present, method =="MBDBS") Mcap.WGBS.Hist <-hist(Mcap.WGBS.RelFreq$plusone, plot=F, breaks=200) Mcap.WGBS.Hist$counts <- Mcap.WGBS.Hist$counts / sum(Mcap.WGBS.Hist$counts) plot(Mcap.WGBS.Hist, freq=TRUE, ylab="Relative Frequency", color="green", ylim=c(0,0.1)) plot_multi_histogram <- function(df, feature, label_column) { plt <- ggplot(df, aes(x=eval(parse(text=feature)), fill=eval(parse(text=label_column)))) + geom_histogram(aes(y = stat(count) / sum(count)), bins = 50) + scale_fill_manual(values=c("#E6550D", "#756BB1", "#31A354")) + labs(x="Log10(Mean TPM +1)", y = "Relative Frequency")+ theme_classic()+ facet_zoom(ylim = c(0, 0.12)) plt + guides(fill=guide_legend(title=label_column)) } #GO Enrichment by method ALL<- Mcap.genes$gene #set the all transcripts list Meth <- Mcap.MBDBS.counts$gene #set the enrichment test list LENGTH <- Mcap.gene.len.df #change contig lists to vectors ALL.vector <-c(t(ALL)) Meth.vector <-c(t(Meth)) ID.vector <- LENGTH$gene LENGTH.vector <- LENGTH$length gene.vector=as.integer(ALL.vector%in%Meth.vector) #Construct new vector with 1 for DEG and 0 for others names(gene.vector)=ALL.vector #set names DEG.pwf<-nullp(gene.vector, ID.vector, bias.data=LENGTH.vector) #weight vector by length of gene #Find enriched GO terms, "selection-unbiased testing for category enrichment amongst differentially expressed (DE) genes for RNA-seq data" GO.wall<-goseq(DEG.pwf, ID.vector, gene2cat=Mcap.GO.terms, test.cats=c("GO:CC", "GO:BP", "GO:MF"), method="Wallenius", use_genes_without_cat=TRUE) #How many enriched GO terms do we have class(GO.wall) head(GO.wall) tail(GO.wall) nrow(GO.wall) #distribution of genes absent Mcap.WGBS.counts <- Mcap.CpG.gene.counts[,c(1,2,5)] Mcap.WGBS.counts$WGBS[Mcap.WGBS.counts$WGBS==1] <- NA Mcap.WGBS.counts <- na.omit(Mcap.WGBS.counts) Mcap.WGBS.counts$method <- "WGBS" colnames(Mcap.WGBS.counts)[2] <- "type" Mcap.RRBS.counts <- Mcap.CpG.gene.counts[,c(1,3,5)] Mcap.RRBS.counts$RRBS[Mcap.RRBS.counts$RRBS==1] <- NA Mcap.RRBS.counts <- na.omit(Mcap.RRBS.counts) Mcap.RRBS.counts$method <- "RRBS" colnames(Mcap.RRBS.counts)[2] <- "type" Mcap.MBDBS.counts <- Mcap.CpG.gene.counts[,c(1,4,5)] Mcap.MBDBS.counts$MBDBS[Mcap.MBDBS.counts$MBDBS==1] <- NA Mcap.MBDBS.counts <- na.omit(Mcap.MBDBS.counts) Mcap.MBDBS.counts$method <- "MBDBS" colnames(Mcap.MBDBS.counts)[2] <- "type" Mcap.CpG.counts.absent <- rbind(Mcap.WGBS.counts, Mcap.RRBS.counts, Mcap.MBDBS.counts) Mcap.CpG.counts.absent$plusone <- log10(Mcap.CpG.counts.absent$mean.TPM+1) Mcap.WGBS.RelFreq <- filter(Mcap.CpG.counts.absent, method =="WGBS") Mcap.RRBS.RelFreq <- filter(Mcap.CpG.counts.absent, method =="RRBS") Mcap.MBDBS.RelFreq <- filter(Mcap.CpG.counts.absent, method =="MBDBS") #GO Enrichment by method ALL<- Mcap.genes$gene #set the all transcripts list Meth <- Mcap.WGBS.counts$gene #set the enrichment test list LENGTH <- Mcap.gene.len.df #change contig lists to vectors ALL.vector <-c(t(ALL)) Meth.vector <-c(t(Meth)) ID.vector <- LENGTH$gene LENGTH.vector <- LENGTH$length gene.vector=as.integer(ALL.vector%in%Meth.vector) #Construct new vector with 1 for DEG and 0 for others names(gene.vector)=ALL.vector #set names DEG.pwf<-nullp(gene.vector, ID.vector, bias.data=LENGTH.vector) #weight vector by length of gene #Find enriched GO terms, "selection-unbiased testing for category enrichment amongst differentially expressed (DE) genes for RNA-seq data" GO.wall<-goseq(DEG.pwf, ID.vector, gene2cat=Mcap.GO.terms, test.cats=c("GO:CC", "GO:BP", "GO:MF"), method="Wallenius", use_genes_without_cat=TRUE) #How many enriched GO terms do we have class(GO.wall) head(GO.wall) tail(GO.wall) nrow(GO.wall) ``` ```{r} # Plotting Distributions pdf("Output/Mcap.gene_expression_present.pdf", width=8.5, height=6) McapEP <- plot_multi_histogram(Mcap.CpG.counts.present, 'plusone', 'method') dev.off() pdf("Output/Mcap.gene_expression_absent.pdf", width=8.5, height=6) McapEA <- plot_multi_histogram(Mcap.CpG.counts.absent, 'plusone', 'method') dev.off() plot_grid(McapEP, McapEA, ncol=1, nrow=2, align="v", labels = c('A) Genes with CpG Present', 'B) Genes with CpG Absent'), label_size = 12) ggsave("Output/Mcap.gene_expression_distributions.pdf", width=8.5, height=6) ``` Mcap GO Slim Term comparison in genes identifed by each method ```{r} #GO Term union df GO.ALL <- left_join(WGBS.GO.terms, RRBS.GO.terms, by="v1", all=TRUE) GO.ALL <- left_join(GO.ALL , MBDBS.GO.terms, by="v1", all=TRUE) # GO Slim fl <- "http://www.geneontology.org/ontology/subsets/goslim_generic.obo" slim <- getOBOCollection(fl) WGBS.Ids <- na.omit(as.character(WGBS.GO.terms$v2)) WGBS.Collection <- GOCollection(WGBS.Ids) WGBS.slims <- goSlim(WGBS.Collection, slim, "BP", evidenceCode="TAS") WGBS.slims <- WGBS.slims[order(WGBS.slims$Term),] WGBS.slims$method <- "WGBS" WGBS.slims <- WGBS.slims[-5,] RRBS.Ids <- na.omit(as.character(RRBS.GO.terms$v2)) RRBS.Collection <- GOCollection(RRBS.Ids) RRBS.slims <- goSlim(RRBS.Collection, slim, "BP", evidenceCode="TAS") RRBS.slims <- RRBS.slims[order(RRBS.slims$Term),] RRBS.slims$method <- "RRBS" RRBS.slims <- RRBS.slims[-5,] MBDBS.Ids <- na.omit(as.character(MBDBS.GO.terms$v2)) MBDBS.Collection <- GOCollection(MBDBS.Ids) MBDBS.slims <- goSlim(MBDBS.Collection, slim, "BP", evidenceCode="TAS") MBDBS.slims <- MBDBS.slims[order(MBDBS.slims$Term),] MBDBS.slims$method <- "MBDBS" MBDBS.slims <- MBDBS.slims[-5,] All.slims <- rbind(WGBS.slims,RRBS.slims,MBDBS.slims) cat.cols <- c("springgreen4","orangered1", "skyblue3") #plot percent goslim per method pdf("Output/Mcap_GOSlim_compare.pdf") ggplot(All.slims, aes(fill=method, y=Percent, x=Term)) + geom_bar(position="dodge", stat="identity") + scale_fill_manual(values=cat.cols) + theme(axis.text.x = element_text(angle = 90, size=4)) + theme(legend.position = "none") + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), panel.background = element_blank(), axis.line = element_line(colour = "black")) dev.off() ``` Pocillopora acuta ```{r} #load methylation data with minimum of 5x coverage of CpGs that has intersected with genes #1041 1471 1637 WGBS.1041 <- read.csv("data/Pact_tab/Meth1_R1_001_val_1_bismark_bt2_pe._5x.tab_gene", header=FALSE, sep="\t") colnames(WGBS.1041) <- c("scaffold", "start", "stop", "permeth","meth", "unmeth", "intersection", "AUGUSTUS", "id", "gene.start","gene.stop", "pos1", "pos2","pos3", "gene") WGBS.1471 <- read.csv("data/Pact_tab/Meth2_R1_001_val_1_bismark_bt2_pe._5x.tab_gene", header=FALSE, sep="\t") colnames(WGBS.1471) <- c("scaffold", "start", "stop", "permeth","meth", "unmeth", "intersection", "AUGUSTUS", "id", "gene.start","gene.stop", "pos1", "pos2","pos3", "gene") WGBS.1637 <- read.csv("data/Pact_tab/Meth3_R1_001_val_1_bismark_bt2_pe._5x.tab_gene", header=FALSE, sep="\t") colnames(WGBS.1637) <- c("scaffold", "start", "stop", "permeth","meth", "unmeth", "intersection", "AUGUSTUS", "id", "gene.start","gene.stop", "pos1", "pos2","pos3", "gene") Pact.WGBS.data <- rbind(WGBS.1041,WGBS.1471,WGBS.1637) Pact.WGBS.genes <- as.data.frame(unique(Pact.WGBS.data$gene)) colnames(Pact.WGBS.genes) <- c("gene") Pact.WGBS.gene.annot <- left_join(Pact.WGBS.genes, Pact.annot, by="gene") nrow(Pact.WGBS.genes) Pact.WGBS.GO <- Pact.WGBS.gene.annot[,c(1,14)] #identify GOids splitted <- strsplit(as.character(Pact.WGBS.GO$GO.IDs), "; ") #split into multiple GO ids Pact.WGBS.GOs <- data.frame(v1 = rep.int(Pact.WGBS.GO$gene, sapply(splitted, length)), v2 = unlist(splitted)) #list all GOs with each assigned transcript Pact.WGBS.GO.terms <- Pact.WGBS.GOs %>% distinct(v2, .keep_all = TRUE) length(unique(Pact.WGBS.GO.terms$v2)) RRBS.1041 <- read.csv("data/Pact_tab/Meth4_R1_001_val_1_bismark_bt2_pe._5x.tab_gene", header=FALSE, sep="\t") colnames(RRBS.1041) <- c("scaffold", "start", "stop", "permeth","meth", "unmeth", "intersection", "AUGUSTUS", "id", "gene.start","gene.stop", "pos1", "pos2","pos3", "gene") RRBS.1471 <- read.csv("data/Pact_tab/Meth5_R1_001_val_1_bismark_bt2_pe._5x.tab_gene", header=FALSE, sep="\t") colnames(RRBS.1471) <- c("scaffold", "start", "stop", "permeth","meth", "unmeth", "intersection", "AUGUSTUS", "id", "gene.start","gene.stop", "pos1", "pos2","pos3", "gene") RRBS.1637 <- read.csv("data/Pact_tab/Meth6_R1_001_val_1_bismark_bt2_pe._5x.tab_gene", header=FALSE, sep="\t") colnames(RRBS.1637) <- c("scaffold", "start", "stop", "permeth","meth", "unmeth", "intersection", "AUGUSTUS", "id", "gene.start","gene.stop", "pos1", "pos2","pos3", "gene") Pact.RRBS.data <- rbind(RRBS.1041,RRBS.1471,RRBS.1637) Pact.RRBS.genes <- as.data.frame(unique(Pact.RRBS.data$gene)) colnames(Pact.RRBS.genes) <- c("gene") Pact.RRBS.gene.annot <- left_join(Pact.RRBS.genes, Pact.annot, by="gene") nrow(Pact.RRBS.genes) Pact.RRBS.GO <- Pact.RRBS.gene.annot[,c(1,14)] #identify GOids splitted <- strsplit(as.character(Pact.RRBS.GO$GO.IDs), "; ") #slit into multiple GO ids Pact.RRBS.GOs <- data.frame(v1 = rep.int(Pact.RRBS.GO$gene, sapply(splitted, length)), v2 = unlist(splitted)) #list all GOs with each assigned transcript Pact.RRBS.GO.terms <- Pact.RRBS.GOs %>% distinct(v2, .keep_all = TRUE) length(unique(Pact.RRBS.GO.terms$v2)) MBDBS.1041 <- read.csv("data/Pact_tab/Meth7_R1_001_val_1_bismark_bt2_pe._5x.tab_gene", header=FALSE, sep="\t") colnames(MBDBS.1041) <- c("scaffold", "start", "stop", "permeth","meth", "unmeth", "intersection", "AUGUSTUS", "id", "gene.start","gene.stop", "pos1", "pos2","pos3", "gene") MBDBS.1471 <- read.csv("data/Pact_tab/Meth8_R1_001_val_1_bismark_bt2_pe._5x.tab_gene", header=FALSE, sep="\t") colnames(MBDBS.1471) <- c("scaffold", "start", "stop", "permeth","meth", "unmeth", "intersection", "AUGUSTUS", "id", "gene.start","gene.stop", "pos1", "pos2","pos3", "gene") MBDBS.1637 <- read.csv("data/Pact_tab/Meth9_R1_001_val_1_bismark_bt2_pe._5x.tab_gene", header=FALSE, sep="\t") colnames(MBDBS.1637) <- c("scaffold", "start", "stop", "permeth","meth", "unmeth", "intersection", "AUGUSTUS", "id", "gene.start","gene.stop", "pos1", "pos2","pos3", "gene") Pact.MBDBS.data <- rbind(MBDBS.1101,MBDBS.1548,MBDBS.1628) Pact.MBDBS.genes <- as.data.frame(unique(Pact.MBDBS.data$gene)) colnames(Pact.MBDBS.genes) <- c("gene") Pact.MBDBS.gene.annot <- left_join(Pact.MBDBS.genes, Pact.annot, by="gene") nrow(MBDBS.genes) Pact.MBDBS.GO <- Pact.MBDBS.gene.annot[,c(1,14)] #identify GOids splitted <- strsplit(as.character(Pact.MBDBS.GO$GO.IDs), "; ") #slit into multiple GO ids Pact.MBDBS.GOs <- data.frame(v1 = rep.int(Pact.MBDBS.GO$gene, sapply(splitted, length)), v2 = unlist(splitted)) #list all GOs with each assigned transcript Pact.MBDBS.GO.terms <- Pact.MBDBS.GOs %>% distinct(v2, .keep_all = TRUE) length(unique(Pact.MBDBS.GO.terms$v2)) Pact.WGBS.genes.circ <- as.data.frame(Pact.WGBS.genes) Pact.WGBS.genes.circ$method <- "WGBS" colnames(Pact.WGBS.genes.circ) <- c("gene", "method") Pact.RRBS.genes.circ <- as.data.frame(Pact.RRBS.genes) Pact.RRBS.genes.circ$method <- "RRBS" colnames(Pact.RRBS.genes.circ) <- c("gene", "method") Pact.MBDBS.genes.circ <- as.data.frame(Pact.MBDBS.genes) Pact.MBDBS.genes.circ$method <- "MBDBS" colnames(Pact.MBDBS.genes.circ) <- c("gene", "method") Pact.All.circ <- left_join(Pact.genes, Pact.WGBS.genes.circ, by="gene", keep=TRUE) colnames(Pact.All.circ)[4] <- "gene" Pact.All.circ <- left_join(Pact.All.circ, Pact.RRBS.genes.circ, by="gene", keep=TRUE) colnames(Pact.All.circ)[4] <- "gene" Pact.All.circ <- left_join(Pact.All.circ, Pact.MBDBS.genes.circ, by="gene", keep=TRUE) colnames(Pact.All.circ) <- c("scaffold","gene.start", "gene.stop", "gene","WGBS","method.WGBS", "RRBS", "method.RRBS", "MBDBS", "method.MBDBS" ) Pact.All.circ.df <- arrange(Pact.All.circ,scaffold, gene.start) Pact.All.circ.data <- Pact.All.circ.df[,c(1:5,7,9)] str(Pact.All.circ.data) Pact.All.circ.data$WGBS <- as.character(as.factor(Pact.All.circ.data$WGBS)) Pact.All.circ.data$RRBS <- as.character(as.factor(Pact.All.circ.data$RRBS)) Pact.All.circ.data$MBDBS <- as.character(as.factor(Pact.All.circ.data$MBDBS)) str(Pact.All.circ.data) Pact.All.circ.data[is.na(Pact.All.circ.data)] <- 0 #change NA to 0 Pact.All.circ.data$WGBS[Pact.All.circ.data$WGBS != 0] <- 1 #change gene to 1 Pact.All.circ.data$RRBS[Pact.All.circ.data$RRBS != 0] <- 1 #change gene to 1 Pact.All.circ.data$MBDBS[Pact.All.circ.data$MBDBS != 0] <- 1 #change gene to 1 str(Pact.All.circ.data) Pact.All.circ.data$WGBS <- as.numeric(as.character(Pact.All.circ.data$WGBS)) Pact.All.circ.data$RRBS <- as.numeric(as.character(Pact.All.circ.data$RRBS)) Pact.All.circ.data$MBDBS <- as.numeric(as.character(Pact.All.circ.data$MBDBS)) str(Pact.All.circ.data) #genes present by method Pact.CpG.gene.counts <- left_join(Pact.All.circ.data, Pact.counts, by="gene") Pact.CpG.gene.counts <- Pact.CpG.gene.counts[,c(4,5,6,7,18)] Pact.WGBS.counts <- Pact.CpG.gene.counts[,c(1,2,5)] colnames(Pact.WGBS.counts)[2] <- "type" Pact.WGBS.counts$method <- "WGBS" Pact.RRBS.counts <- Pact.CpG.gene.counts[,c(1,3,5)] colnames(Pact.RRBS.counts)[2] <- "type" Pact.RRBS.counts$method <- "RRBS" Pact.MBDBS.counts <- Pact.CpG.gene.counts[,c(1,4,5)] colnames(Pact.MBDBS.counts)[2] <- "type" Pact.MBDBS.counts$method <- "MBDBS" Pact.CpG.genes.all <- rbind(Pact.WGBS.counts,Pact.RRBS.counts, Pact.MBDBS.counts) Pact.CpG.genes.all$gene <- factor(Pact.CpG.genes.all$gene, levels = Pact.All.circ.df$gene) pdf("Output/Pact_PresAbs_genes.pdf") ggplot(Pact.CpG.genes.all, aes(gene, method, fill=type, width = 1)) + geom_tile() + coord_fixed(ratio = 6000)+ scale_fill_gradientn(colours=c("black", "yellow"))+ ggtitle("B ")+ theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1,size =1), #Set the text angle axis.line = element_line(color = 'black'), #Set the axes color panel.border = element_blank(), #Set the border panel.grid.major = element_blank(), #Set the major gridlines panel.grid.minor = element_blank(), #Set the minor gridlines plot.background=element_blank()) #Set the plot background dev.off() #plot_grid(Mcap.CpG.genes.heatmap, Pact.CpG.genes.heatmap, ncol=1, align="v") #ggsave("Output/gene_heatmaps.pdf", width=8.5, height=6) Pact.WGBS.counts$type[Pact.WGBS.counts$type==0] <- NA Pact.WGBS.counts <- na.omit(Pact.WGBS.counts) Pact.RRBS.counts$type[Pact.RRBS.counts$type==0] <- NA Pact.RRBS.counts <- na.omit(Pact.RRBS.counts) Pact.MBDBS.counts$type[Pact.MBDBS.counts$type==0] <- NA Pact.MBDBS.counts <- na.omit(Pact.MBDBS.counts) Pact.CpG.counts.present <- rbind(Pact.WGBS.counts, Pact.RRBS.counts, Pact.MBDBS.counts) Pact.CpG.counts.present$plusone <- log10(Pact.CpG.counts.present$mean.TPM+1) #cat.cols <- c("#EDF8E9", "#F2F0F7", "#FEEDDE") Pact.WGBS.RelFreq <- filter(Pact.CpG.counts.present, method =="WGBS") Pact.RRBS.RelFreq <- filter(Pact.CpG.counts.present, method =="RRBS") Pact.MBDBS.RelFreq <- filter(Pact.CpG.counts.present, method =="MBDBS") Pact.WGBS.Hist <-hist(Pact.WGBS.RelFreq$plusone, plot=F, breaks=200) Pact.WGBS.Hist$counts <- Pact.WGBS.Hist$counts / sum(Pact.WGBS.Hist$counts) plot(Pact.WGBS.Hist, freq=TRUE, ylab="Relative Frequency", color="green", ylim=c(0,0.1)) #distribution of genes absent Pact.WGBS.counts <- Pact.CpG.gene.counts[,c(1,2,5)] Pact.WGBS.counts$WGBS[Pact.WGBS.counts$WGBS==1] <- NA Pact.WGBS.counts <- na.omit(Pact.WGBS.counts) Pact.WGBS.counts$method <- "WGBS" colnames(Pact.WGBS.counts)[2] <- "type" Pact.RRBS.counts <- Pact.CpG.gene.counts[,c(1,3,5)] Pact.RRBS.counts$RRBS[Pact.RRBS.counts$RRBS==1] <- NA Pact.RRBS.counts <- na.omit(Pact.RRBS.counts) Pact.RRBS.counts$method <- "RRBS" colnames(Pact.RRBS.counts)[2] <- "type" Pact.MBDBS.counts <- Pact.CpG.gene.counts[,c(1,4,5)] Pact.MBDBS.counts$MBDBS[Pact.MBDBS.counts$MBDBS==1] <- NA Pact.MBDBS.counts <- na.omit(Pact.MBDBS.counts) Pact.MBDBS.counts$method <- "MBDBS" colnames(Pact.MBDBS.counts)[2] <- "type" Pact.CpG.counts.absent <- rbind(Pact.WGBS.counts, Pact.RRBS.counts, Pact.MBDBS.counts) Pact.CpG.counts.absent$plusone <- log10(Pact.CpG.counts.absent$mean.TPM+1) Pact.WGBS.RelFreq <- filter(Pact.CpG.counts.absent, method =="WGBS") Pact.RRBS.RelFreq <- filter(Pact.CpG.counts.absent, method =="RRBS") Pact.MBDBS.RelFreq <- filter(Pact.CpG.counts.absent, method =="MBDBS") # Plotting Distributions pdf("Output/Pact.gene_expression_present.pdf", width=8.5, height=6) PactEP <- plot_multi_histogram(Pact.CpG.counts.present, 'plusone', 'method') dev.off() pdf("Output/Pact.gene_expression_absent.pdf", width=8.5, height=6) PactEA <- plot_multi_histogram(Pact.CpG.counts.absent, 'plusone', 'method') dev.off() plot_grid(McapEP, PactEP, McapEA, PactEA, ncol=2, nrow=2, align="v", labels = c('A) M. Capitata', 'B) P. acuta', '', ''), label_size = 12) ggsave("Output/gene_expression_distributions.pdf", width=12, height=6) ``` ```{r} #plotting both species together plot_grid(Mcap.PresAbs, Pact.PresAbs, ncol=1, nrow=2, align="v", labels = c('A) M. Capitata', 'B) P. acuta', '', ''), label_size = 12) ggsave("Output/PresAbs_heatmaps.pdf", width=12, height=6) ``` ```{r} #GO Term union df Pact.GO.ALL <- left_join(Pact.WGBS.GO.terms, Pact.RRBS.GO.terms, by="v1", all=TRUE) Pact.GO.ALL <- left_join(Pact.GO.ALL , Pact.MBDBS.GO.terms, by="v1", all=TRUE) Pact.WGBS.Ids <- na.omit(as.character(Pact.WGBS.GO.terms$v2)) Pact.WGBS.Collection <- GOCollection(Pact.WGBS.Ids) Pact.WGBS.slims <- goSlim(Pact.WGBS.Collection, slim, "BP", evidenceCode="TAS") Pact.WGBS.slims <- Pact.WGBS.slims[order(Pact.WGBS.slims$Term),] Pact.WGBS.slims$method <- "WGBS" Pact.WGBS.slims <- Pact.WGBS.slims[-5,] Pact.RRBS.Ids <- na.omit(as.character(Pact.RRBS.GO.terms$v2)) Pact.RRBS.Collection <- GOCollection(Pact.RRBS.Ids) Pact.RRBS.slims <- goSlim(Pact.RRBS.Collection, slim, "BP", evidenceCode="TAS") Pact.RRBS.slims <- Pact.RRBS.slims[order(Pact.RRBS.slims$Term),] Pact.RRBS.slims$method <- "RRBS" Pact.RRBS.slims <- Pact.RRBS.slims[-5,] Pact.MBDBS.Ids <- na.omit(as.character(Pact.MBDBS.GO.terms$v2)) Pact.MBDBS.Collection <- GOCollection(Pact.MBDBS.Ids) Pact.MBDBS.slims <- goSlim(Pact.MBDBS.Collection, slim, "BP", evidenceCode="TAS") Pact.MBDBS.slims <- Pact.MBDBS.slims[order(Pact.MBDBS.slims$Term),] Pact.MBDBS.slims$method <- "MBDBS" Pact.MBDBS.slims <- Pact.MBDBS.slims[-5,] Pact.All.slims <- rbind(Pact.WGBS.slims,Pact.RRBS.slims,Pact.MBDBS.slims) #plot percent goslim per method pdf("Output/Pact_GOSlim_compare.pdf") ggplot(Pact.All.slims, aes(fill=method, y=Percent, x=Term)) + geom_bar(position="dodge", stat="identity") + scale_fill_manual(values=cat.cols) + theme(axis.text.x = element_text(angle = 90, size=4)) + theme(legend.position = "none") + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), panel.background = element_blank(), axis.line = element_line(colour = "black")) dev.off() ``` #heatmap plotting of genes identified as present or absent in each method ```{r} plot_grid(Mcap.genepresence.heatmap, Pact.genepresence.heatmap, ncol=2, align="v") ggsave("Output/gene_presence.pdf", width=8.5, height=6) ``` ```{r} WGBS.genes.circ <- as.data.frame(WGBS.genes) WGBS.genes.circ$method <- "WGBS" colnames(WGBS.genes.circ) <- c("gene", "method") RRBS.genes.circ <- as.data.frame(RRBS.genes) RRBS.genes.circ$method <- "RRBS" colnames(RRBS.genes.circ) <- c("gene", "method") MBDBS.genes.circ <- as.data.frame(MBDBS.genes) MBDBS.genes.circ$method <- "MBDBS" colnames(MBDBS.genes.circ) <- c("gene", "method") All.circ <- left_join(Mcap.genes, WGBS.genes.circ, by="gene", keep=TRUE) colnames(All.circ)[4] <- "gene" All.circ <- left_join(All.circ, RRBS.genes.circ, by="gene", keep=TRUE) colnames(All.circ)[4] <- "gene" All.circ <- left_join(All.circ, MBDBS.genes.circ, by="gene", keep=TRUE) colnames(All.circ) <- c("scaffold","gene.start", "gene.stop", "gene","WGBS","method.WGBS", "RRBS", "method.RRBS", "MBDBS", "method.MBDBS" ) All.circ.df <- arrange(All.circ,scaffold, gene.start) All.circ.data <- All.circ.df[,c(1:5,7,9)] str(All.circ.data) All.circ.data$WGBS <- as.character(as.factor(All.circ.data$WGBS)) All.circ.data$RRBS <- as.character(as.factor(All.circ.data$RRBS)) All.circ.data$MBDBS <- as.character(as.factor(All.circ.data$MBDBS)) str(All.circ.data) All.circ.data[is.na(All.circ.data)] <- 0 #change NA to 0 All.circ.data$WGBS[All.circ.data$WGBS != 0] <- 1 #change gene to 1 All.circ.data$RRBS[All.circ.data$RRBS != 0] <- 1 #change gene to 1 All.circ.data$MBDBS[All.circ.data$MBDBS != 0] <- 1 #change gene to 1 str(All.circ.data) All.circ.data$WGBS <- as.numeric(as.character(All.circ.data$WGBS)) All.circ.data$RRBS <- as.numeric(as.character(All.circ.data$RRBS)) All.circ.data$MBDBS <- as.numeric(as.character(All.circ.data$MBDBS)) str(All.circ.data) All.circ.mat <- as.matrix(All.circ.data[1:10000,5:7]) rownames(All.circ.mat) <- All.circ.data$gene[1:10000] str(All.circ.mat) All.circ.fac <- All.circ.data[1:10000,1:4] str(All.circ.fac) ``` ```{r} #heatmap pdf("Output/Mcap_heatmap_genes_present.pdf") pheatmap(All.circ.mat, color=c("black","yellow"), scale="none",show_rownames =FALSE, fontsize_row = 4, cluster_cols = FALSE, show_colnames=TRUE, cluster_rows = TRUE) #plot heatmap dev.off() ``` ```{r} # Ribbon chord diagram linking genes WGBSvRRBS <- intersect(WGBS.genes$X.gene_id, RRBS.genes$X.gene_id) WGBSvMBDBS <- intersect(WGBS.genes$X.gene_id, MBDBS.genes$X.gene_id) RRBSvMBDBS <- intersect(RRBS.genes$X.gene_id, MBDBS.genes$X.gene_id) WGBSvRRBS <- as.data.frame(WGBSvRRBS) WGBSvRRBS$from <- "WGBS" WGBSvRRBS$to <- "RRBS" WGBSvRRBS$value <- 1 colnames(WGBSvRRBS) <- c("gene", "from","to", "value") WGBSvRRBS.circ <- left_join(Mcap.genes, WGBSvRRBS, by="gene", keep=TRUE) WGBSvRRBS.circ$from <- "WGBS" WGBSvRRBS.circ$to <- "RRBS" WGBSvRRBS.circ$value[is.na(WGBSvRRBS.circ$value)] <- 0 #change NA to 0 WGBSvMBDBS <- as.data.frame(WGBSvMBDBS) WGBSvMBDBS$from <- "WGBS" WGBSvMBDBS$to <- "MBDBS" WGBSvMBDBS$value <- 1 colnames(WGBSvMBDBS) <- c("gene", "from","to", "value") WGBSvMBDBS.circ <- left_join(Mcap.genes, WGBSvMBDBS, by="gene", keep=TRUE) WGBSvMBDBS.circ$from <- "WGBS" WGBSvMBDBS.circ$to <- "MBDBS" WGBSvMBDBS.circ$value[is.na(WGBSvMBDBS.circ$value)] <- 0 #change NA to 0 RRBSvMBDBS <- as.data.frame(RRBSvMBDBS) RRBSvMBDBS$from <- "RRBS" RRBSvMBDBS$to <- "MBDBS" RRBSvMBDBS$value <- 1 colnames(RRBSvMBDBS) <- c("gene", "from","to", "value") RRBSvMBDBS.circ <- left_join(Mcap.genes, RRBSvMBDBS, by="gene", keep=TRUE) RRBSvMBDBS.circ$from <- "RRBS" RRBSvMBDBS.circ$to <- "MBDBS" RRBSvMBDBS.circ$value[is.na(RRBSvMBDBS.circ$value)] <- 0 #change NA to 0 RRBSvMBDBS.circ <- na.omit(RRBSvMBDBS.circ) #index <- seq 1:nrows #All.circ.data <- rbind(WGBSvRRBS.circ[1:100,], WGBSvMBDBS.circ[1:100,],RRBSvMBDBS.circ[1:100,]) ``` ```{r} #heatmap ``` ```{r} WGBS.genes WGBS.genes$index WGBS.genes$method RRBS.genes MBDBS.genes ``` ```{r} pdf("Output/circosplot.pdf", width=4.40945, height=4.40945) par(mfrow=c(1,1),mar=c(0,0,0,0), lwd=0.5) circos.clear() circos.par(start.degree=120, gap.degree=c(10, 10), cell.padding=c(0,0,0,0), track.margin=c(0,0)) circos.initialize(factors=pools, x=circdat$index, xlim=cbind(rep(0, 2), table(pools))) ``` ```{r} All.circ.data <- rbind(WGBSvRRBS.circ[1:100,], WGBSvMBDBS.circ[1:100,],RRBSvMBDBS.circ[1:100,]) #All.circ.data$from <- paste0(All.circ.data$gene.x, "_", All.circ.data$from) #All.circ.data$to <- paste0(All.circ.data$gene.x, "_", All.circ.data$to) All.circ.df <- All.circ.data[,6:8] str(All.circ.df) #All.circ.df$value[All.circ.df$value==0] <- NA #str(All.circ.df) #All.circ.df <- na.omit(All.circ.df) pdf("Output/chord_genes_w_CPG_by_Method.pdf") chordDiagram(All.circ.df) dev.off() ``` ```{r} WGBS.genes.circ <- as.data.frame(WGBS.genes) WGBS.genes.circ$method <- "WGBS" colnames(WGBS.genes.circ) <- c("gene", "method") RRBS.genes.circ <- as.data.frame(RRBS.genes) RRBS.genes.circ$method <- "RRBS" colnames(RRBS.genes.circ) <- c("gene", "method") MBDBS.genes.circ <- as.data.frame(MBDBS.genes) MBDBS.genes.circ$method <- "MBDBS" colnames(MBDBS.genes.circ) <- c("gene", "method") All.circ <- left_join(Mcap.genes, WGBS.genes.circ, by="gene", keep=TRUE) colnames(All.circ)[4] <- "gene" All.circ <- left_join(All.circ, RRBS.genes.circ, by="gene", keep=TRUE) colnames(All.circ)[4] <- "gene" All.circ <- left_join(All.circ, MBDBS.genes.circ, by="gene", keep=TRUE) colnames(All.circ) <- c("scaffold","gene.start", "gene.stop", "gene","WGBS","method.WGBS", "RRBS", "method.RRBS", "MBDBS", "method.MBDBS" ) All.circ.df <- arrange(All.circ,scaffold, gene.start) All.circ.data <- All.circ.df[,c(1:5,7,9)] str(All.circ.data) All.circ.data$WGBS <- as.character(as.factor(All.circ.data$WGBS)) All.circ.data$RRBS <- as.character(as.factor(All.circ.data$RRBS)) All.circ.data$MBDBS <- as.character(as.factor(All.circ.data$MBDBS)) str(All.circ.data) All.circ.data[is.na(All.circ.data)] <- 0 #change NA to 0 All.circ.data$WGBS[All.circ.data$WGBS != 0] <- 1 #change gene to 1 All.circ.data$RRBS[All.circ.data$RRBS != 0] <- 1 #change gene to 1 All.circ.data$MBDBS[All.circ.data$MBDBS != 0] <- 1 #change gene to 1 All.circ.data$WGBSvRRBS <- as.numeric(All.circ.data$WGBS)-as.numeric(All.circ.data$RRBS) #gene, from, to, value # The arguments to spread(): # - data: Data object # - key: Name of column containing the new column names # - value: Name of column containing values x <- pivot_longer(All.circ.data, cols=gene) #Then reshape to long so that there is a to, from, and 1 or 0 column for each gene #allCpGgenes <- left_join(WGBS.GO.terms, RRBS.GO.terms, by="v1", keep=TRUE) #colnames(allCpGgenes) <-c() #allCpGgenes <- left_join(allCpGgenes, MBDBS.GO.terms, by="v1", keep=TRUE) set.seed(999) mat = matrix(sample(18, 18), 3, 6) rownames(mat) = paste0("S", 1:3) colnames(mat) = paste0("E", 1:6) mat chordDiagram(mat) ``` ```{r} #GO Term union df GO.ALL <- left_join(WGBS.GO.terms, RRBS.GO.terms, by="v1", all=TRUE) GO.ALL <- left_join(GO.ALL , MBDBS.GO.terms, by="v1", all=TRUE) # GO Slim fl <- "http://www.geneontology.org/ontology/subsets/goslim_generic.obo" slim <- getOBOCollection(fl) WGBS.Ids <- na.omit(as.character(WGBS.GO.terms$v2)) WGBS.Collection <- GOCollection(WGBS.Ids) WGBS.slims <- goSlim(WGBS.Collection, slim, "BP", evidenceCode="TAS") WGBS.slims <- WGBS.slims[order(WGBS.slims$Term),] WGBS.slims$method <- "WGBS" WGBS.slims <- WGBS.slims[-5,] RRBS.Ids <- na.omit(as.character(RRBS.GO.terms$v2)) RRBS.Collection <- GOCollection(RRBS.Ids) RRBS.slims <- goSlim(RRBS.Collection, slim, "BP", evidenceCode="TAS") RRBS.slims <- RRBS.slims[order(RRBS.slims$Term),] RRBS.slims$method <- "RRBS" RRBS.slims <- RRBS.slims[-5,] MBDBS.Ids <- na.omit(as.character(MBDBS.GO.terms$v2)) MBDBS.Collection <- GOCollection(MBDBS.Ids) MBDBS.slims <- goSlim(MBDBS.Collection, slim, "BP", evidenceCode="TAS") MBDBS.slims <- MBDBS.slims[order(MBDBS.slims$Term),] MBDBS.slims$method <- "MBDBS" MBDBS.slims <- MBDBS.slims[-5,] All.slims <- rbind(WGBS.slims,RRBS.slims,MBDBS.slims) cat.cols <- c("springgreen4","orangered1", "skyblue3") #plot percent goslim per method pdf("Output/GOSlim_compare.pdf") ggplot(All.slims, aes(fill=method, y=Percent, x=Term)) + geom_bar(position="dodge", stat="identity") + scale_fill_manual(values=cat.cols) + theme(axis.text.x = element_text(angle = 90, size=4)) + theme(legend.position = "none") + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), panel.background = element_blank(), axis.line = element_line(colour = "black")) dev.off() ```