---
title: "Median Methylation"
author: "Yaamini Venkataraman"
date: "7/9/2020"
output: html_document
---

In this script I will calculate median methylation of genes with 5x coverage in a sample.

# Set up R Markdown Document

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```

# Session information

```{r}
sessionInfo()
```

# *M. capitata*

```{bash}
mkdir ../analyses/Characterizing-CpG-Methylation-5x/Mcap/Median-Methylation-Calculations #Create directory for output
```

## Associate CpG loci with genes

The overlap files generated in [this Jupyter notebook](https://github.com/hputnam/Meth_Compare/blob/master/scripts/Characterizing-CpG-Methylation-5x.ipynb) are missing gene ID information. I can get that from the previously-generated gene track previously generated using `intersectBed`.

```{bash}
head ../genome-feature-files/Mcap.GFFannotation.gene.gff #Look at format of Mcap gene track. Gene ID is the last column
```

```{bash}
#For each sample-gene overlap file (ends in *bedgraph.bed-mcGenes)
#Use intersectBed to find where loci and genes intersect, allowing loci to be mapped to genes
#wb: Print all lines in the second file
#a: sample-gene overlap file
#b: gene track
#Save output in a new file that has the same base name and ends in -geneID

for f in ../analyses/Characterizing-CpG-Methylation-5x/Mcap/*bedgraph.bed-mcGenes
do
  /usr/local/bin/intersectBed \
  -wb \
  -a ${f} \
  -b ../genome-feature-files/Mcap.GFFannotation.gene.gff \
  > ${f}-geneID
done
```

```{bash}
find ../analyses/Characterizing-CpG-Methylation-5x/Mcap/*geneID | wc -l #Confirm that 9 files were created (one per sample)
```

```{bash}
mv ../analyses/Characterizing-CpG-Methylation-5x/Mcap/*geneID ../analyses/Characterizing-CpG-Methylation-5x/Mcap/Median-Methylation-Calculations #Move file to subdirectory for median methylation calculations
```

```{bash}
head ../analyses/Characterizing-CpG-Methylation-5x/Mcap/Median-Methylation-Calculations/Meth10_R1_001_val_1_bismark_bt2_pe._5x.bedgraph.bed-mcGenes-geneID #Confirm files are in the new subdirectory and look at output
```

## Import sample overlaps with gene ID information

```{r}
setwd("../analyses/Characterizing-CpG-Methylation-5x/Mcap/Median-Methylation-Calculations") #Set working directory within the notebook chunk for list.files to find the necessary files
filesToImport <- list.files(pattern = "*geneID") #Create a file list for all 9 files to import. Only import overlaps for full samples (not divided by methylation status)
list2env(lapply(setNames(filesToImport,
                         make.names(gsub("_R1_001_val_1_bismark_bt2_pe._5x.bedgraph.bed-mcGenes-geneID", "", filesToImport))),
                read.delim, header = FALSE),
         envir = .GlobalEnv) #Import files to the .GlobalEnv with list2env. Use lapply to setNames of the files by taking all the common parts of their names out. Read files with read.delim and include header = FALSE. Files will be named Meth#
head(Meth10) #Confirm import
```

## Format dataframes

```{r}
samplesMcap <- c("Meth10",
                 "Meth11",
                 "Meth12",
                 "Meth13",
                 "Meth14",
                 "Meth15",
                 "Meth16",
                 "Meth17",
                 "Meth18") #Create a vector of sample names
```

```{r}
for(sample in samplesMcap) { #For each sample listed in samplesMcap
  sample.tmp <- get(sample) #Extract sample based on vector contents
  sample.tmp <- sample.tmp[,-c(5:12)] #Remove extraneous columns
  colnames(sample.tmp) <- c("chr", "start", "stop", "percentMeth", "geneID") #Rename columns
  assign(sample, sample.tmp) #Replace sample with edited sample.tmp contents
}
head(Meth17) #Confirm formatting changes
```

## Calculate median methylation by `geneID`

```{r}
for(sample in samplesMcap) { #For each sample listed in samplesMcap
  sample.tmp <- get(sample) #Extract sample based on vector contents
  sample.tmp <- aggregate(percentMeth ~ geneID, data = sample.tmp, FUN = median) #Use aggregate to group geneID and calculate median percent methylation
  assign(sample, sample.tmp) #Replace sample with edited sample.tmp contents
}
head(Meth17) #Confirm median methylation calculation
```

```{r}
for (i in 1:length(samplesMcap)) { #For each sample listed in samplesMcap
  sample <- get(samplesMcap[i]) #Extract sample based on vector contents
  fileName <- paste("../analyses/Characterizing-CpG-Methylation-5x/Mcap/Median-Methylation-Calculations/", samplesMcap[i], "-Median-Methylation", ".txt", sep = "") #Assign filename for each sample
  write.table(sample, file = fileName, sep = "\t", row.names = FALSE, col.names = TRUE) #Write out files into the Median-Methylation-Calculations subdirectory
}
```

# *P. acuta*

```{bash}
mkdir ../analyses/Characterizing-CpG-Methylation-5x/Pact/Median-Methylation-Calculations #Create directory for output
```

## Associate CpG loci with genes

```{bash}
head ../genome-feature-files/Pact.GFFannotation.Genes.gff #Look at format of Pact gene track. Gene ID is the last column
```

```{bash}
#For each sample-gene overlap file (ends in *bedgraph.bed-paGenes)
#Use intersectBed to find where loci and genes intersect, allowing loci to be mapped to genes
#wb: Print all lines in the second file
#a: sample-gene overlap file
#b: gene track
#Save output in a new file that has the same base name and ends in -geneID

for f in ../analyses/Characterizing-CpG-Methylation-5x/Pact/*bedgraph.bed-paGenes
do
  /usr/local/bin/intersectBed \
  -wb \
  -a ${f} \
  -b ../genome-feature-files/Pact.GFFannotation.Genes.gff \
  > ${f}-geneID
done
```

```{bash}
find ../analyses/Characterizing-CpG-Methylation-5x/Pact/*geneID | wc -l #Confirm that 9 files were created (one per sample)
```

```{bash}
mv ../analyses/Characterizing-CpG-Methylation-5x/Pact/*geneID ../analyses/Characterizing-CpG-Methylation-5x/Pact/Median-Methylation-Calculations #Move file to subdirectory for median methylation calculations
```

```{bash}
head ../analyses/Characterizing-CpG-Methylation-5x/Pact/Median-Methylation-Calculations/Meth1_R1_001_val_1_bismark_bt2_pe._5x.bedgraph.bed-paGenes-geneID #Confirm files are in the new subdirectory and look at output
```

## Import sample overlaps with gene ID information

```{r}
setwd("../analyses/Characterizing-CpG-Methylation-5x/Pact/Median-Methylation-Calculations") #Set working directory within the notebook chunk for list.files to find the necessary files
filesToImport <- list.files(pattern = "*geneID") #Create a file list for all 9 files to import. Only import overlaps for full samples (not divided by methylation status)
list2env(lapply(setNames(filesToImport,
                         make.names(gsub("_R1_001_val_1_bismark_bt2_pe._5x.bedgraph.bed-paGenes-geneID", "", filesToImport))),
                read.delim, header = FALSE),
         envir = .GlobalEnv) #Import files to the .GlobalEnv with list2env. Use lapply to setNames of the files by taking all the common parts of their names out. Read files with read.delim and include header = FALSE. Files will be named Meth#
head(Meth1) #Confirm import
```

## Format dataframes

```{r}
samplesPact <- c("Meth1",
                 "Meth2",
                 "Meth3",
                 "Meth4",
                 "Meth5",
                 "Meth6",
                 "Meth7",
                 "Meth8",
                 "Meth9") #Create a vector of sample names
```

```{r}
for(sample in samplesPact) { #For each sample listed in samplesPact
  sample.tmp <- get(sample) #Extract sample based on vector contents
  sample.tmp <- sample.tmp[,-c(5:12)] #Remove extraneous columns
  colnames(sample.tmp) <- c("chr", "start", "stop", "percentMeth", "geneID") #Rename columns
  assign(sample, sample.tmp) #Replace sample with edited sample.tmp contents
}
head(Meth7) #Confirm formatting changes
```

## Calculate median methylation by `geneID`

```{r}
for(sample in samplesPact) { #For each sample listed in samplesPact
  sample.tmp <- get(sample) #Extract sample based on vector contents
  sample.tmp <- aggregate(percentMeth ~ geneID, data = sample.tmp, FUN = median) #Use aggregate to group geneID and calculate median percent methylation
  assign(sample, sample.tmp) #Replace sample with edited sample.tmp contents
}
head(Meth7) #Confirm median methylation calculation
```

```{r}
for (i in 1:length(samplesPact)) { #For each sample listed in samplesPact
  sample <- get(samplesPact[i]) #Extract sample based on vector contents
  fileName <- paste("../analyses/Characterizing-CpG-Methylation-5x/Pact/Median-Methylation-Calculations/", samplesPact[i], "-Median-Methylation", ".txt", sep = "") #Assign filename for each sample
  write.table(sample, file = fileName, sep = "\t", row.names = FALSE, col.names = TRUE) #Write out files into the Median-Methylation-Calculations subdirectory
}
```