--- title: "40-gene-methylation" output: html_document --- Lets see if we can get gene methylation with a 10x bed file.. and gene gff via intersect bed... ## NOTE: Due to the large file sizes generated in this Rmd file, numerous intermediate files cannot ## be uploaded to GitHub. Thus, each user will have to run this Rmd file themselves. ``` #Use intersectBed to find where loci and genes intersect, allowing loci to be mapped to annotated genes #wb: Print all lines in the second file #a: file that ends in posOnly #b: annotated gene list #Save output in a new file that has the same base name and ends in -Annotated.txt for f in RAnalysis/data/*F_R1_val_1_10x.tab do intersectBed \ -wb \ -a ${f} \ -b RAnalysis/data/C_virginica-3.0_Gnomon_genes.bed \ > ${f}_gene done ``` # Load libraries ```{r load-libraries} library("tidyverse") library("Rfast") # For row-wise coeeficient of variation calcs library("RColorBrewer") library("ggExtra") ``` ## Set variables ```{r set-variables} # Vectors for subsetting samples by different groups all <- c("S13M", "S16F", "S19F", "S39F", "S44F", "S52F", "S53F", "S54F", "S64M", "S6M", "S76F", "S7M", "S12M", "S22F", "S23M", "S29F", "S31M", "S35F", "S36F", "S3F", "S41F", "S48M", "S50F", "S59M", "S77F", "S9M") controls <- c("S13M", "S16F", "S19F", "S39F", "S44F", "S52F", "S53F", "S54F", "S64M", "S6M", "S76F", "S7M") exposed <- c("S12M", "S22F", "S23M", "S29F", "S31M", "S35F", "S36F", "S3F", "S41F", "S48M", "S50F", "S59M", "S77F", "S9M") controls.males <- c("S13M", "S64M", "S6M", "S7M") exposed.males <- c("S12M", "S23M", "S31M", "S48M", "S59M", "S9M") controls.females <- c("S16F", "S19F", "S39F", "S44F", "S52F", "S53F", "S54F", "S76F") exposed.females <- c("S22F", "S29F", "S35F", "S36F", "S3F", "S41F", "S50F", "S77F") females <- c(controls.females, exposed.females) males <- c(controls.males, exposed.males) # Declare list for storing methylation dataframes mean.gene.methylation.CoV.list <- list() # Vector of treatment comparisons comparisons <- c("all", "females_v_males", "controls_v_exposed", "controls.females_v_exposed.females", "controls.males_v_exposed.males", "controls.females_v_controls.males", "exposed.females_v_exposed.males" ) ``` # Download bedgraph files ```{bash download-bedgraphs} mkdir --parents ../data/big cd ../data/big wget -r \ --no-check-certificate \ --continue \ --quiet \ --no-directories --no-parent \ -P . \ -A R1_val_1_10x.bedgraph \ https://gannet.fish.washington.edu/seashell/bu-mox/scrubbed/120321-cvBS/ ``` # Examine data files ```{bash glance-at-bedgraph-format} head ../data/big/3F_R1_val_1_10x.bedgraph ``` ```{bash glance-at-genes-BED} head ../genome-features/C_virginica-3.0_Gnomon_genes.bed ``` # Test intersectBed command ```{bash} NAME=5F /home/shared/bedtools2/bin/intersectBed \ -wb \ -a ../data/big/3F_R1_val_1_10x.bedgraph \ -b ../genome-features/C_virginica-3.0_Gnomon_genes.bed \ | awk -v name=$NAME -v OFS="\t" '{ print $0, name}' \ | head ``` # Run intersectBed on all files ```{bash run-intersectBed-on-all-files} cd ../data/big/ FILES=$(ls *bedgraph) cd - for file in ${FILES} do NAME=$(echo ${file} | awk -F "_" '{print $1}') echo ${NAME} /home/shared/bedtools2/bin/intersectBed \ -wb \ -a ../data/big/${NAME}_R1_val_1_10x.bedgraph \ -b ../genome-features/C_virginica-3.0_Gnomon_genes.bed \ | awk -v name=$NAME -v OFS="\t" '{ print $0, name}' \ > ../output/40-gene-methylation/${NAME}_mGene.out done ``` ## Glance at example output ```{bash glance-at-example-output} head ../output/40-gene-methylation/36F_mGene.out ``` # Concatenate all gene methylation files ```{bash concatenate-all-gene-methylation-files} cat ../output/40-gene-methylation/*_mGene.out > ../output/40-gene-methylation/meth_all-samples.out ``` # Read in gene methylation file ```{r} meth_all <- read.delim("../output/40-gene-methylation/meth_all-samples.out", header = FALSE) ``` ## View gene methylation dataframe ```{r view-gene-methylation-dataframe} meth_all ``` # Plot methylation distribution ```{r plot-methylation-disctribution} ggplot() + geom_histogram(data = meth_all, aes(x = V4), fill = "blue", alpha = 0.2) ``` # Calculate mean methylation per gene per sample ```{r calculate-mean-methylation-per-gene-per-sample} gm <- meth_all %>% mutate(art = paste(V8, V11, sep = '_')) %>% group_by(art) %>% summarize(avg_meth = mean(V4)) ``` ## View mean methylation per gene per sample ```{r view-mean-methylation-per-gene-per-sample} gm ``` ```{r} mc <- inner_join(gm, ic, by = "art") %>% separate(art, into = c("gene", "sample"), sep = "_") %>% separate(sample, into = c("number", "sex"), sep = -1) ``` ```{r, fig.width=20,fig.height=10} ggplot(mc, aes(x = avg_meth, y = isoform_count, color = sex)) + geom_point(alpha = 0.6, size = 1) + facet_wrap(~number) + geom_hline(yintercept=10) ``` ```{r, fig.width=20,fig.height=10} ggplot(mc, aes(x = avg_meth, y = isoform_count, color = sex)) + geom_point(alpha = 0.6, size = 1) + facet_wrap(~number) + stat_smooth(method = glm, method.args = list(family = binomial)) ``` ```{r} p <- ggplot(mc, aes(x = avg_meth, y = isoform_count)) + geom_point(alpha = 0.2, size = 1) + facet_wrap(~sex) ggMarginal(p, type = "histogram") ``` ```{r} ggplot(mc, aes(x = avg_meth)) + geom_histogram() + facet_wrap(~sex) ``` # Create gene methylation file where libary is row ```{r write-mean-gene-methylation-to-file} gm %>% separate(art, into = c("gene", "sample_id"), sep = '_') %>% spread(value = avg_meth, key = gene) %>% write.csv(., "../output/40-gene-methylation/40-gene-methylation.csv") ``` # Reorient gene methylation data ```{r reorient-mean-methylation-data} methylation <- gm %>% separate(art, into = c("gene", "sample_id"), sep = '_') %>% spread(value = avg_meth, key = gene) head(methylation) ``` ## Save methylation object Creating this object is useful for skipping time-consuming steps above. when needing to use this notebook in the future. Also avoids the need for high-memory computer that the previous steps above require. ```{r save-methylation-object} save(methylation, file = "../output/40-gene-methylation/methylation.RData") ``` ## Prepare methylation file for joining with BED ```{r prep-methylation-file-for-joining-with-BED} # Need to transpose to get gene names as rows # For joining with BED methylation.transposed <- as.data.frame(t(methylation)) # Convert rownames to a column of data methylation.transposed.rownames <- rownames_to_column(methylation.transposed) # # Convert first row to header names(methylation.transposed.rownames) <- methylation.transposed.rownames[1, ] methylation.transposed.rownames <- methylation.transposed.rownames[-1, ] # Replace "." in gene names with "-" # Will match gene names in BED file methylation.transposed.rownames$sample_id <- str_replace_all(methylation.transposed.rownames$sample_id, "\\.", "-") # Rename sample name columns to match sample grouping vectors # Capture column names # Took lazy approach and applied to all columns instead of subsetting desired columns original_cols <- colnames(methylation.transposed.rownames) # Append "S" to beginning of all samples names colnames(methylation.transposed.rownames) <- paste("S", original_cols, sep = "") # Replace "Ssample_id" with "name" to match BED file names(methylation.transposed.rownames)[names(methylation.transposed.rownames) == "Ssample_id"] <- "name" # Capture gene names gene.names <- pull(methylation.transposed.rownames, name) head(methylation.transposed.rownames) ``` ## Convert to matrix ```{r convert-to-matrix} # Convert first column to rownames for rowcvs() compatibility - requires matrix as input methylation.transposed.rownames.rownames <- methylation.transposed.rownames[,-1] # Assign gene names to rownames - needed for rowcvs() compatibility - requires matrix as input rownames(methylation.transposed.rownames.rownames) <- methylation.transposed.rownames[,1] # Convert from "characters" to numbers methylation.transposed.rownames.rownames.converted <- sapply(methylation.transposed.rownames.rownames, as.numeric) # Coerce into matrix for use with `rowcvs()` function methylation.transposed.matrix <- data.matrix(methylation.transposed.rownames.rownames.converted, rownames.force = TRUE) head(methylation.transposed.matrix) ``` ## Calculate mean gene methylation coefficients of variation (CoV) for all samples ```{r calculate-mean-gene-methylation-coefficients-of-variation-for-all-samples} for (comparison in comparisons) { # Parse comparisons to use subsequent strings for # column labels and file naming ## Get the left side of the comparison first_comparison <- sapply( strsplit(comparison, "_"), head,1 ) ## Get the right side of the comparison second_comparison <- sapply( strsplit(comparison,"_"), tail,1 ) # Set first CoV treatment column name first_treatment_col_CoV <- paste(first_comparison, "mean.methylation.CoV", sep = ".") # Set second CoV treatment column name second_treatment_col_CoV <- paste(second_comparison, "mean.methylation.CoV", sep = ".") # Set first mean methylation treatment column name first_treatment_col_mean_methylation <- paste(first_comparison, "mean.methylation", sep = ".") # Set second mean methylation treatment column name second_treatment_col_mean_methylation <- paste(second_comparison, "mean.methylation", sep = ".") # Calculate coefficients of variation if (comparison != "all") { # Calculate coefficients of variation using rowcvs(), convert to data frame and add to list # Uses get() to convert string to name of the vector stored in "first_comparison" mean.gene.methylation.CoV.list[[comparison]][first_treatment_col_CoV] <- as.data.frame( methylation.transposed.matrix %>% subset(., TRUE, get(first_comparison)) %>% rowcvs(., unbiased = TRUE) ) # Calculate coefficients of variation using rowcvs(), convert to data frame and add to list # Uses get() to convert string to name of the vector stored in "second_comparison" mean.gene.methylation.CoV.list[[comparison]][second_treatment_col_CoV] <- as.data.frame( methylation.transposed.matrix %>% subset(., TRUE, get(second_comparison)) %>% rowcvs(., unbiased = TRUE) ) # Calculate mean gene methylation using rowMeans(), convert to data frame and add to list. # Uses get() to convert string to name of the vector stored in "first_comparison" mean.gene.methylation.CoV.list[[comparison]][first_treatment_col_mean_methylation] <- as.data.frame( methylation.transposed.matrix %>% subset(., TRUE, get(first_comparison)) %>% rowMeans(.,) ) # Calculate mean gene methylation using rowMeans(), convert to data frame and add to list. # Uses get() to convert string to name of the vector stored in "second_comparison" mean.gene.methylation.CoV.list[[comparison]][second_treatment_col_mean_methylation] <- as.data.frame( methylation.transposed.matrix %>% subset(., TRUE, get(second_comparison)) %>% rowMeans(.,) ) # Convert to data frames # Commands above create lists of lists mean.gene.methylation.CoV.list <- lapply(mean.gene.methylation.CoV.list, as.data.frame) # Calculate mean gene methylation of the comparison using rowMeans(), convert to matrix and add to list. mean.gene.methylation.CoV.list[[comparison]]$mean.methylation <- rowMeans(as.matrix(mean.gene.methylation.CoV.list[[comparison]][c(first_treatment_col_mean_methylation, second_treatment_col_mean_methylation)])) # Calculate absolute value of difference in methylation between the first and second components # of the comparison and add to list. mean.gene.methylation.CoV.list[[comparison]]$abs.delta.CoV <- abs(mean.gene.methylation.CoV.list[[comparison]][,first_treatment_col_CoV] - mean.gene.methylation.CoV.list[[comparison]][,second_treatment_col_CoV]) } else { mean.gene.methylation.CoV.list[[comparison]] <- as.data.frame(methylation.transposed.matrix %>% rowcvs(., unbiased = TRUE)) %>% rename("CoV" = 1) # Calculate mean gene methylation of the comparison using rowMeans(), convert to data frame and add to list. # Since the operation creates a data frame as output, the data frame only has one column. # Thus, using the rowMeans(.,))[, 1] operates on the first (and only) column mean.gene.methylation.CoV.list[[comparison]]$mean.methylation <- as.data.frame(methylation.transposed.matrix %>% rowMeans(.,))[, 1] } } ``` ## Save mean.gene.methylation.CoV.list as R Object Useful for quick access to list without need for processing all steps above. ```{r save-methylation-CoV-list-object} # Save as an object to avoid having to re-run all of the code which generates this list (it takes a long time to run) save(mean.gene.methylation.CoV.list, file = "../output/40-gene-methylation/mean.gene.methylation.CoV.list.RData") ``` ## Add gene names to rows for CoV data frames ```{r gene-names-to-CoV-dataframes} # Apply gene names to column mean.gene.methylation.CoV.list <- lapply(mean.gene.methylation.CoV.list, function(x){ rownames(x) <- gene.names; x } ) # Convert row names to column and assign column name mean.gene.methylation.CoV.list <- lapply(mean.gene.methylation.CoV.list, function(x){ rownames_to_column(x, var = "gene.id") } ) ``` ## Scatter plot all mean gene methylation CoV comparisons ### Colored by absolute delta CoV Since all data frames are formatted the same (gene.id, comp1.CoV, comp2.CoV, comp1.mean.methylation, comp2.mean.methylation, mean.methylation, abs.delta.CoV) we can refer to them by column numbers. HOWEVER, to do that, we need to extract the name of each data frame, using the `names()` function. IMPORTANT: These plots do _not_ plot rows containing `NA` or `NaN`!!! As such, there is a lot of missing data. ```{r scatter-plot-all-mean-gene-methylation-CoV-by-abs.delta} # Find the maximum value across all data frames max_abs.delta.CoV <- max(sapply(mean.gene.methylation.CoV.list, function(df) max(df$abs.delta.CoV, na.rm = TRUE)), na.rm = TRUE) # Set color palette # Not entirely sure how this works, as I took it from some plotting Steven was doing. myPalette <- colorRampPalette(rev(brewer.pal(11, "Spectral"))) # Set scale colors for absolute values of the delta between comparisons # Not entirely sure how this works, as I took it from some plotting Steven was doing. sc <- scale_colour_gradientn(colours = myPalette(100), limits=c(0, max_abs.delta.CoV)) # Generate scatterplots # Uses the following columns: # 2 - comp1.CoV # 3 - comp2.CoV # 7 - abs.delta.CoV (for coloration) # Skips any data frame with less than 4 columns (i.e. the `all` data frame) lapply( mean.gene.methylation.CoV.list, function(df) { if (ncol(df) > 3) { names <- names(df) df %>% ggplot(aes_string(x=names[2], y=names[3], colour=names[7])) + geom_point(alpha = 0.6, size = 3) + geom_smooth(method = "lm") + geom_abline(size=1.0,colour="red") + sc } } ) ``` ## Scatter plot all mean gene methylation CoV comparisons ### Colored by mean methylation Since all data frames are formatted the same (gene.id, comp1.CoV, comp2.CoV, comp1.mean.methylation, comp2.mean.methylation, mean.methylation, abs.delta.CoV) we can refer to them by column numbers. HOWEVER, to do that, we need to extract the name of each data frame, using the `names()` function. IMPORTANT: These plots do _not_ plot rows containing `NA` or `NaN`!!! As such, there is a lot of missing data. ```{r scatter-plot-all-mean-gene-methylation-CoV-by-mean-methylation} # Find the maximum value across all data frames max_mean.methylation <- max(sapply(mean.gene.methylation.CoV.list, function(df) max(df$mean.methylation, na.rm = TRUE)), na.rm = TRUE) # Set color palette # Not entirely sure how this works, as I took it from some plotting Steven was doing. myPalette <- colorRampPalette(rev(brewer.pal(11, "Spectral"))) # Set scale colors for absolute values of the delta between comparisons # Not entirely sure how this works, as I took it from some plotting Steven was doing. sc <- scale_colour_gradientn(colours = myPalette(100), limits=c(0, max_mean.methylation)) # Generate scatterplots # Uses the following columns: # 2 - comp1.CoV # 3 - comp2.CoV # 6 - mean.methylation (for coloration) # Skips any data frame with less than 4 columns (i.e. the `all` data frame) lapply( mean.gene.methylation.CoV.list, function(df) { if (ncol(df) > 3) { names <- names(df) df %>% ggplot(aes_string(x=names[2], y=names[3], colour=names[6])) + geom_point(alpha = 0.6, size = 3) + geom_smooth(method = "lm") + geom_abline(size=1.0,colour="red") + sc } } ) ``` ## Write CoV data frames to CSVs ```{r write-CoVs-to-CSVs} # Write data frames to CSVs in ../output/40-gene-methylation/ dir # Uses names of data frames as names of output files. sapply(names(mean.gene.methylation.CoV.list), function(x) write.csv(mean.gene.methylation.CoV.list[[x]], file = file.path("../output/40-gene-methylation/", paste(x, "CoV-mean-methylation", "csv", sep=".")), quote = FALSE, row.names = FALSE) ) ```