--- title: "Circos Meth Compare" author: "HM Putnam" date: "7/19/2020" output: html_document editor_options: chunk_output_type: console --- Setup ```{r setup, include=FALSE} knitr::opts_chunk$set(echo = TRUE) ``` Load Libraries ```{r} library(circlize) library(tidyverse) library(ComplexHeatmap) ``` Generate Circos plots #https://jokergoo.github.io/circlize_book/book/initialize-genomic-plot.html#customize-chromosome-track #The input data for circos.genomicInitialize() is also a data frame with at least three columns. The first column is genomic category (for cytoband data, it is chromosome name), and the next two columns are positions in each genomic category. Montipora capitata ```{r} #read in the 5x coverage bedgraph of CpG for all 9 samples, 3 reps x 3 methods Mcap <- read.table("../data/Mcap_union_5x.bedgraph", sep = "\t", header = TRUE, na.strings = "N/A") head(Mcap) str(Mcap) ``` ```{r} #load in all genes gff from a genome Mcap.cytoband <- read.table("../../data/GCF_000233375.1_ICSASG_v2_genomic.gff", sep = "\t", header = FALSE) #remove extra coloumns from gene gff Mcap.cytoband.df <- Mcap.cytoband[,c(1,4,5,6,9)] str(Mcap.cytoband.df) head(Mcap.cytoband.df) Mcap.cytoband.df <- Mcap.cytoband.df[which(substr(Mcap.cytoband.df$V9,4,7) == "gene"),] #save only gene name/ID Mcap.cytoband.df$V9 <- gsub(";.*","",Mcap.cytoband.df$V9) Mcap.cytoband.df$V9 <- gsub("ID=gene-","",Mcap.cytoband.df$V9) #only keep the NC scaffolds Mcap.cytoband.df <- Mcap.cytoband.df[grep("NC",Mcap.cytoband.df$V1),] #look at distinct numner of scaffolds n_distinct(Mcap.cytoband.df$V1) Mcap.gene.cytoband <- Mcap.cytoband.df Mcap.gene.cytoband$V4 <-as.numeric(Mcap.gene.cytoband$V4) #make start numeric Mcap.gene.cytoband$V5 <- as.numeric(Mcap.gene.cytoband$V5) #make stop numeric Mcap.gene.cytoband$V6 <- as.numeric(0.5) Mcap.gene.cytoband$V9 <- as.numeric(1.0) #examine scaffolds for the number of genes in a scaffold Mcap.sizes <- Mcap.cytoband.df %>% group_by(V1) %>% summarise(count=n()) %>% arrange(-count) #Mcap.sizes[1,1] # write.csv(Mcap.sizes,"../Output/Circos_plots/genesperscaff.csv") #identify scaffold lengths Mcap.scaffold.lengths <- read.table("../../data/GCF_000233375.1_ICSASG_v2_genomic.fa.fai", sep = "\t", header = FALSE) Mcap.scaffold.lengths <- Mcap.scaffold.lengths[grep("NC",Mcap.scaffold.lengths$V1),] #keep only the first two columns Mcap.scaffold.lengths <- Mcap.scaffold.lengths[,1:2] head(Mcap.scaffold.lengths) #plot the top 10 longest scaffolds #for (i in 1:n_distinct(Mcap.cytoband.df$V1)) { #for (i in 8:24) { for (i in 26:30) { # #filter the gene information for the scaffold with the highest number of genes Mcap.gene.cytoband.perscaff <- Mcap.gene.cytoband %>% filter(V1 == Mcap.scaffold.lengths$V1[i]) #filter the CpG data for the scaffold with the highest number of genes #Mcap_union <- Mcap %>% ## filter(chrom== Mcap.scaffold.lengths$V1[i]) #str(Mcap_union) #add a top row to start the scaffold at 0 Mcap.gene.cytoband.head <- c(Mcap.scaffold.lengths$V1[i],0,1,0,1) #identify scaffold lengths Mcap.scaffold.lengths.ind <- Mcap.scaffold.lengths %>% filter(V1 == Mcap.scaffold.lengths$V1[i]) #add a bottom row to stop the scaffold at max length Mcap.gene.cytoband.tail <- c(Mcap.scaffold.lengths$V1[i],Mcap.scaffold.lengths.ind[1,2]-1,Mcap.scaffold.lengths.ind[1,2],0,1) #combine top, data, and bottom rows Mcap.gene.cytoband.perscaff <- rbind(Mcap.gene.cytoband.head, Mcap.gene.cytoband.perscaff) Mcap.gene.cytoband.perscaff <- rbind(Mcap.gene.cytoband.perscaff,Mcap.gene.cytoband.tail) str(Mcap.gene.cytoband.perscaff) Mcap.gene.cytoband.perscaff$V4 <- as.numeric(Mcap.gene.cytoband.perscaff$V4) Mcap.gene.cytoband.perscaff$V5 <- as.numeric(Mcap.gene.cytoband.perscaff$V5) Mcap.gene.cytoband.perscaff$V6 <- as.numeric(Mcap.gene.cytoband.perscaff$V6) Mcap.gene.cytoband.perscaff$V9 <- as.numeric(Mcap.gene.cytoband.perscaff$V9) #scale the data from 0-1 Mcap_union.WGBS <- read.table("../../data/5x_bed/16C_26psu_5x-averages.bedgraph",sep = "\t", header = FALSE) Mcap_union.RRBS <- read.table("../../data/5x_bed/16C_32psu_5x-averages.bedgraph",sep = "\t", header = FALSE) Mcap_union.MBDBS <- read.table("../../data/5x_bed/8C_26psu_5x-averages.bedgraph",sep = "\t", header = FALSE) Mcap_union.MBDBS2 <- read.table("../../data/5x_bed/8C_32psu_5x-averages.bedgraph",sep = "\t", header = FALSE) # Mcap_union.WGBS <- Mcap_union %>% mutate(avg = rowMeans(.[4:6], na.rm=TRUE)) # Mcap_union.WGBS <- Mcap_union.WGBS[,c(1,2,3,13)] # Mcap_union.WGBS$avg <- Mcap_union.WGBS$avg/100 # Mcap_union.WGBS$avg[is.nan(Mcap_union.WGBS$avg)] <- NA # Mcap_union.RRBS <- Mcap_union %>% mutate(avg = rowMeans(.[7:9], na.rm=TRUE)) # Mcap_union.RRBS <- Mcap_union.RRBS[,c(1,2,3,13)] # Mcap_union.RRBS$avg <- Mcap_union.RRBS$avg/100 # Mcap_union.RRBS$avg[is.nan(Mcap_union.RRBS$avg)] <- NA # Mcap_union.MBDBS <- Mcap_union %>% mutate(avg = rowMeans(.[10:12], na.rm=TRUE)) # Mcap_union.MBDBS <- Mcap_union.MBDBS[,c(1,2,3,13)] # Mcap_union.MBDBS$avg <- Mcap_union.MBDBS$avg/100 # Mcap_union.MBDBS$avg[is.nan(Mcap_union.MBDBS$avg)] <- NA #i = 1 pdf(file = paste0("../20210629_circos/",i,"Mcap_Circos_scaff",Mcap.sizes$V1[i],"_top10scaffsbylength.pdf"), width=6, height=10) #set plotting parameters for pdf par(mfrow=c(1,1)) #clear circos.clear() #Montipora set.seed(123) #set plotting parameters for circlize circos.par("track.height" = 0.4, start.degree = 90, gap.degree = 20, cell.padding = c(0.01,1,0.01,1)) #add outer scaffold band circos.initializeWithIdeogram(Mcap.gene.cytoband.perscaff, species = NULL, sort.chr = TRUE) #add data track circos.genomicTrack(Mcap_union.WGBS, stack=FALSE, ylim=c(0,1), track.height = 0.1, panel.fun = function(region, value, ...) { circos.genomicPoints(region, value, pch = 16, cex = 0.1, col="#31A354") }) #labal y axis of data track circos.yaxis(side="left", labels = TRUE, tick=TRUE, at=c(0,0.5,1)) # #add data track circos.genomicTrack(Mcap_union.RRBS, stack=FALSE, ylim=c(0,1), track.height = 0.1, panel.fun = function(region, value, ...) { circos.genomicPoints(region, value, pch = 16, cex = 0.1, col="#756BB1") }) # #labal y axis of data track circos.yaxis(side="left", labels = TRUE, tick=TRUE, at=c(0,0.5,1)) # #add data track circos.genomicTrack(Mcap_union.MBDBS, stack=FALSE, ylim=c(0,1), track.height = 0.1, panel.fun = function(region, value, ...) { circos.genomicPoints(region, value, pch = 16, cex = 0.1, col="#E6550D") }) # #labal y axis of data track circos.yaxis(side="left", labels = TRUE, tick=TRUE, at=c(0,0.5,1)) # #add data track circos.genomicTrack(Mcap_union.MBDBS2, stack=FALSE, ylim=c(0,1), track.height = 0.1, panel.fun = function(region, value, ...) { circos.genomicPoints(region, value, pch = 16, cex = 0.1, col="#808080") }) # #labal y axis of data track circos.yaxis(side="left", labels = TRUE, tick=TRUE, at=c(0,0.5,1)) #add legend lgd = Legend(at = c("16C_26psu", "16C_32psu", "8C_26psu","8C_32psu" ), legend_gp=gpar(fill=c("#31A354","#756BB1","#E6550D", "#808080"))) draw(lgd, just = c("bottom")) dev.off() } ``` Pocillopora acuta ```{r} #read in the 5x coverage bedgraph of CpG for all 9 samples, 3 reps x 3 methods Pact <- read.table("../data/Pact_union_5x.bedgraph", sep = "\t", header = TRUE, na.strings = "N/A") head(Pact) str(Pact) #load in all genes gff from a genome Pact.cytoband <- read.table("../genome-feature-files/Pact.GFFannotation.genes.gff", sep = "\t", header = FALSE) #remove extra coloumns from gene gff Pact.cytoband.df <- Pact.cytoband[,c(1,4,5,6,9)] str(Pact.cytoband.df) head(Pact.cytoband.df) #look at distinct numner of scaffolds n_distinct(Pact.cytoband$V1) #subset the columns of data Pact.gene.cytoband <- Pact.cytoband[,c(1,4,5,6,7)] Pact.gene.cytoband$V4 <- as.numeric(Pact.gene.cytoband$V4) Pact.gene.cytoband$V5 <- as.numeric(Pact.gene.cytoband$V5) Pact.gene.cytoband$V6 <- as.numeric(0.5) Pact.gene.cytoband$V7 <- as.numeric(1.0) str(Pact.gene.cytoband) #examine scaffolds for the number of genes in a scaffold Pact.sizes <- Pact.cytoband %>% group_by(V1) %>% summarise(count=n()) %>% arrange(-count) Pact.sizes[1,1] #filter the gene information for the scaffold with the highest number of genes Pact.gene.cytoband <- Pact.gene.cytoband %>% filter(V1 == "scaffold502_cov107" ) str(Pact.gene.cytoband) #filter the CpG data for the scaffold with the highest number of genes Pact_union <- Pact %>% filter(chrom== "scaffold502_cov107") str(Pact_union) #set the scaffold identifier as numeric (need to track actual scaffold id from Pact.sizes[1,1]) Pact_union$chrom <- 1 #add a top row to start the scaffold at 0 Pact.gene.cytoband.head <- c(1,0,1,0,1) #identify scaffold lengths Pact.scaffold.lengths <- read.table("../data/Pact.genome_assembly-sequence-lengths.txt", sep = "\t", stringsAsFactors=FALSE, header = FALSE) %>% filter(V1 == "scaffold502_cov107") #add a bottom row to stop the scaffold at max length Pact.gene.cytoband.tail <- c(1,Pact.scaffold.lengths[1,2]-1,Pact.scaffold.lengths[1,2],0,1) #view datafram information str(Pact.gene.cytoband) #set the scaffold identifier as numeric (need to track actual scaffold id from Pact.sizes[1,1]) Pact.gene.cytoband$V1 <- 1 #combine top, data, and bottom rows Pact.gene.cytoband <- rbind(Pact.gene.cytoband.head, Pact.gene.cytoband) str(Pact.gene.cytoband) Pact.gene.cytoband <- rbind(Pact.gene.cytoband,Pact.gene.cytoband.tail) str(Pact.gene.cytoband) #scale the data from 0-1 Pact_union.WGBS <- Pact_union %>% mutate(avg = rowMeans(.[4:6], na.rm=TRUE)) Pact_union.WGBS <- Pact_union.WGBS[,c(1,2,3,13)] Pact_union.WGBS$avg <- Pact_union.WGBS$avg/100 Pact_union.WGBS$avg[is.nan(Pact_union.WGBS$avg)] <- NA Pact_union.RRBS <- Pact_union %>% mutate(avg = rowMeans(.[7:9], na.rm=TRUE)) Pact_union.RRBS <- Pact_union.RRBS[,c(1,2,3,13)] Pact_union.RRBS$avg <- Pact_union.RRBS$avg/100 Pact_union.RRBS$avg[is.nan(Pact_union.RRBS$avg)] <- NA Pact_union.MBDBS <- Pact_union %>% mutate(avg = rowMeans(.[10:12], na.rm=TRUE)) Pact_union.MBDBS <- Pact_union.MBDBS[,c(1,2,3,13)] Pact_union.MBDBS$avg <- Pact_union.MBDBS$avg/100 Pact_union.MBDBS$avg[is.nan(Pact_union.MBDBS$avg)] <- NA ``` ```{r} #plot top scaffolds with the greatest number of gene pdf('../Output/Circos_plots/Circos_topscaff.pdf', width=6, height=10) #set plotting parameters for pdf par(mfrow=c(2,1)) #clear circos.clear() #Montipora set.seed(123) #set plotting parameters for circlize circos.par("track.height" = 0.2, start.degree = 90, gap.degree = 20, cell.padding = c(0.01,1,0.01,1)) #add outer scaffold band circos.initializeWithIdeogram(Mcap.gene.cytoband, species = NULL, sort.chr = TRUE) #add data track circos.genomicTrack(Mcap_union.WGBS, stack=FALSE, ylim=c(0,1), track.height = 0.1, panel.fun = function(region, value, ...) { circos.genomicPoints(region, value, pch = 16, cex = 0.1, col="#31A354") }) #labal y axis of data track circos.yaxis(side="left", labels = TRUE, tick=TRUE, at=c(0,0.5,1)) #add data track circos.genomicTrack(Mcap_union.RRBS, stack=FALSE, ylim=c(0,1), track.height = 0.1, panel.fun = function(region, value, ...) { circos.genomicPoints(region, value, pch = 16, cex = 0.1, col="#756BB1") }) #labal y axis of data track circos.yaxis(side="left", labels = TRUE, tick=TRUE, at=c(0,0.5,1)) #add data track circos.genomicTrack(Mcap_union.MBDBS, stack=FALSE, ylim=c(0,1), track.height = 0.1, panel.fun = function(region, value, ...) { circos.genomicPoints(region, value, pch = 16, cex = 0.1, col="#E6550D") }) #labal y axis of data track circos.yaxis(side="left", labels = TRUE, tick=TRUE, at=c(0,0.5,1)) #add legend lgd = Legend(at = c("WGBS", "RRBS", "MBDBS"), legend_gp=gpar(fill=c("#31A354","#756BB1","#E6550D"))) draw(lgd, just = c("bottom")) #Pocillopora #clear before next plot circos.clear() set.seed(123) #set plotting parameters for circlize circos.par("track.height" = 0.2, start.degree = 90, gap.degree = 20, cell.padding = c(0.01,1,0.01,1)) #add outer scaffold band circos.initializeWithIdeogram(Pact.gene.cytoband, species = NULL, sort.chr = TRUE) #add data track circos.genomicTrack(Pact_union.WGBS, stack=FALSE, ylim=c(0,1), track.height = 0.1, panel.fun = function(region, value, ...) { circos.genomicPoints(region, value, pch = 16, cex = 0.1, col="#31A354") }) #labal y axis of data track circos.yaxis(side="left", labels = TRUE, tick=TRUE, at=c(0,0.5,1)) #add data track circos.genomicTrack(Pact_union.RRBS, stack=FALSE, ylim=c(0,1), track.height = 0.1, panel.fun = function(region, value, ...) { circos.genomicPoints(region, value, pch = 16, cex = 0.1, col="#756BB1") }) #labal y axis of data track circos.yaxis(side="left", labels = TRUE, tick=TRUE, at=c(0,0.5,1)) #add data track circos.genomicTrack(Pact_union.MBDBS, stack=FALSE, ylim=c(0,1), track.height = 0.1, panel.fun = function(region, value, ...) { circos.genomicPoints(region, value, pch = 16, cex = 0.1, col="#E6550D") }) #labal y axis of data track circos.yaxis(side="left", labels = TRUE, tick=TRUE, at=c(0,0.5,1)) #add legend lgd = Legend(at = c("WGBS", "RRBS", "MBDBS"), legend_gp=gpar(fill=c("#31A354","#756BB1","#E6550D"))) draw(lgd, just = c("bottom")) dev.off() ```