---
title: "Chacterizing CpG Methylation"
output: html_document
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```

In this script, I'll create figures to characterize CpG methylation in my *C. virginica* gonad samples. I will use a list of CpGs with at least 5x coverage in any samples to look at the distribution of methylated CpGs. I'll also use individual sample bedgraphs to calculate percent methylation across genome features.

# Session information

```{r}
sessionInfo()
```

# Distribution of CpGs

## Import data

```{r}
cpgMethylation <- read.csv("2019-04-09-All-5x-CpGs.csv", header = FALSE) #Import file with CpG methylation for all loci with 5x coverage
colnames(cpgMethylation) <- c("chromosome", "start", "end", "methylation") #Add column names
head(cpgMethylation) #Confirm import
```

## Create figure

```{r}
RColorBrewer::display.brewer.all() #Show all RColorBrewer palettes. I will choose greens.
plotColors <- rev(RColorBrewer::brewer.pal(7, "GnBu")) #Create a color palette for the barplots. Use 5 green-blue shades from RColorBrewer. Reverse theorder so the darkest shade is used first.
barplot(t(t(proportionData)),
        col = plotColors) #See what plot looks like with new scheme
barplot(t(t(proportionData)),
        col = dichromat(plotColors)) #Check that the plot colors will be easy to interpret for those with color blindess
```

```{r}
#pdf("2019-04-10-5x-CpG-Frequency-Distribution.pdf", width = 11, height = 8.5)
hist(x = cpgMethylation$methylation, axes = FALSE, xlab = "", ylab = "", main = "", col = plotColors[2], xaxs = "i", yaxs = "i") #Create base plot
axis(side = 1, col = "grey80", at = seq(from = 0, to = 100, by = 10), cex.axis = 1.2) #Add x-axis
mtext(side = 1, text = "Methylation (%)", line = 3, cex = 1.5) #Add x-axis label
axis(side = 2, col = "grey80", las = 2, labels = c("0", "2", "4", "6"), at = c(0, 2e+05, 4e+05, 6e+05), cex.axis = 1.2) #add y-axis
mtext(side = 2, text = "Frequency (x100,000)", line = 2.5, cex = 1.5) #Add y-axis label
#dev.off()
```

# Percent methylation of genome features

For each genome feature, I will do the following:

1. Interect sample bedgraphs with genome feature track
2. Add percent methylation information
2. Calculate median percent methylation for the entire matrix

## Check format of previously-generated files and copy to current folder

```{bash}
head ../2019-08-14-Differentially-Methylated-Genes/zr2096_1_s1_R1_val_1_bismark_bt2_pe.deduplicated.bismark.cov_5x.bedgraph #Individual sample bedgraph with all information
```

```{bash}
cp ../2019-08-14-Differentially-Methylated-Genes/*5x.bedgraph .
```

```{bash}
find *5x.bedgraph
```

```{bash}
head ../2019-08-14-Differentially-Methylated-Genes/zr2096_1_s1_R1_val_1_bismark_bt2_pe.deduplicated.bismark.cov_5x.bedgraph.posOnly #Individual sample bedgraph with chromosome, start, and end position only
```

```{bash}
cp ../2019-08-14-Differentially-Methylated-Genes/*posOnly .
```

```{bash}
find *posOnly
```

```{bash}
head ../2019-08-14-Differentially-Methylated-Genes/zr2096_1_s1_R1_val_1_bismark_bt2_pe.deduplicated.bismark.cov_5x.bedgraph.sorted #Indvidual sample bedgraph with merged chr, start, end information to join files
```

```{bash}
cp ../2019-08-14-Differentially-Methylated-Genes/*bedgraph.sorted .
```

```{bash}
find *bedgraph.sorted
```

## Reformat global methylation file

```{bash}
head 2019-04-09-All-5x-CpGs.bedgraph #Check current file
```

```{bash}
awk '{print $1"\t"$2"\t"$3}' 2019-04-09-All-5x-CpGs.bedgraph > 2019-04-09-All-5x-CpGs.bedgraph.posOnly #Remove percent methylation information
```

```{bash}
head 2019-04-09-All-5x-CpGs.bedgraph.posOnly
```

```{bash}
#Create a version of the file that can be used to add percent methylation information back with a join command

awk '{print $1"-"$2"-"$3"\t"$1"\t"$2"\t"$3"\t"$4}' 2019-04-09-All-5x-CpGs.bedgraph \
| sort -k1,1 \
> 2019-04-09-All-5x-CpGs.bedgraph.sorted 
```

```{bash}
head 2019-04-09-All-5x-CpGs.bedgraph.sorted
```

## Putative promoters

```{bash}
head -n 1 ../2018-11-01-DML-and-DMR-Analysis/2019-05-29-Flanking-Analysis/2019-05-29-mRNA-Promoter-Track.bed #Check promoter track
```

### Run `intersectBed`

```{bash}
find *posOnly
```

```{bash}
#For each file that ends in posOnly
#Use intersectBed to find where loci and promoters intersect
#wb: Print all lines in the second file
#a: file that ends in posOnly
#b: promoter track
#Only keep specified columns
#Save output in a new file that has the same base name and ends in -promoterOverlap
for f in *posOnly
do
  /Users/Shared/bioinformatics/bedtools2/bin/intersectBed \
  -wb \
  -a ${f} \
  -b ../2018-11-01-DML-and-DMR-Analysis/2019-05-29-Flanking-Analysis/2019-05-29-mRNA-Promoter-Track.bed \
  | awk '{print $1"\t"$2"\t"$3"\t"$7"\t"$8}' \
  > ${f}-promoterOverlap
done
```

```{bash}
find *promoterOverlap
```

```{bash}
head zr2096_1_s1_R1_val_1_bismark_bt2_pe.deduplicated.bismark.cov_5x.bedgraph.posOnly-promoterOverlap #Confirm file is annotated
```

### Add back percent methylation information

```{bash}
#For each file that ends in promoterOverlap
#Print the first three columns (chr, start, end) with dashes in between, then the rest of the columns in the file
#Save the file with the base name + .sorted
for f in *promoterOverlap
do
  awk '{print $1"-"$2"-"$3"\t"$0}' ${f} \
  | sort -k1,1 \
  > ${f}.sorted
done
```

```{bash}
find *promoterOverlap.sorted
```

```{bash}
head -n 1 zr2096_1_s1_R1_val_1_bismark_bt2_pe.deduplicated.bismark.cov_5x.bedgraph.posOnly-promoterOverlap.sorted
```

```{bash}
#For each file that ends in bedgraph
#Join 2 files using the first column. The files and tab-delimited and the output should also be tab-delimited
#The first file ends with .sorted
#The second file ends with .posOnly-promoterOverlap.sorted
#Add .annotated.percentMeth to the base name of the output file
for f in *bedgraph
do
  join -j 1 -t $'\t' \
  ${f}.sorted \
  ${f}.posOnly-promoterOverlap.sorted \
  | awk '{print $2"\t"$3"\t"$4"\t"$9"\t"$10"\t"$5}' \
  > ${f}-promoterOverlap-percentMeth
done
```

```{bash}
find *promoterOverlap-percentMeth
```

```{bash}
head 2019-04-09-All-5x-CpGs.bedgraph-promoterOverlap-percentMeth
```

### Calculate median methylation

```{bash}
#For each file that ends in *promoterOverlap-percentMeth
#Sort numerically
#Calculate the median of the sixth column (percent methylation) for each file
#Save output in a new file
(for f in *promoterOverlap-percentMeth
do
  sort -n ${f} \
  | awk ' { a[i++]=$6; } END { x=int((i+1)/2); if (x < (i+1)/2) print (a[x-1]+a[x])/2; else print a[x-1]; }'
done) > 2020-02-25-promoterOverlap-percentMeth-Medians.txt
```

```{bash}
cat 2020-02-25-promoterOverlap-percentMeth-Medians.txt
```

## UTRs

```{bash}
head -n 1 ../2019-05-13-Generating-Genome-Feature-Tracks/C_virginica-3.0_Gnomon_exonUTR_yrv.bed #Check promoter track
```

### Run `intersectBed`

```{bash}
#For each file that ends in posOnly
#Use intersectBed to find where loci and promoters intersect
#wb: Print all lines in the second file
#a: file that ends in posOnly
#b: promoter track
#Only keep specified columns
#Save output in a new file that has the same base name and ends in -promoterOverlap
for f in *posOnly
do
  /Users/Shared/bioinformatics/bedtools2/bin/intersectBed \
  -wb \
  -a ${f} \
  -b ../2019-05-13-Generating-Genome-Feature-Tracks/C_virginica-3.0_Gnomon_exonUTR_yrv.bed \
    | awk '{print $1"\t"$2"\t"$3"\t"$5"\t"$6}' \
  > ${f}-UTROverlap
done
```

```{bash}
find *UTROverlap
```

```{bash}
head zr2096_1_s1_R1_val_1_bismark_bt2_pe.deduplicated.bismark.cov_5x.bedgraph.posOnly-UTROverlap #Confirm file is annotated
```

### Add back percent methylation information

```{bash}
#For each file that ends in UTROverlap
#Print the first three columns (chr, start, end) with dashes in between, then the rest of the columns in the file
#Save the file with the base name + .sorted
for f in *UTROverlap
do
  awk '{print $1"-"$2"-"$3"\t"$0}' ${f} \
  | sort -k1,1 \
  > ${f}.sorted
done
```

```{bash}
find *UTROverlap.sorted
```

```{bash}
head -n 1 zr2096_1_s1_R1_val_1_bismark_bt2_pe.deduplicated.bismark.cov_5x.bedgraph.posOnly-UTROverlap.sorted
```

```{bash}
#For each file that ends in bedgraph
#Join 2 files using the first column. The files and tab-delimited and the output should also be tab-delimited
#The first file ends with .sorted
#The second file ends with .posOnly-UTROverlap.sorted
#Add .annotated.percentMeth to the base name of the output file
for f in *bedgraph
do
  join -j 1 -t $'\t' \
  ${f}.sorted \
  ${f}.posOnly-UTROverlap.sorted \
  | awk '{print $2"\t"$3"\t"$4"\t"$9"\t"$10"\t"$5}' \
  > ${f}-UTROverlap-percentMeth
done
```

```{bash}
find *UTROverlap-percentMeth
```

```{bash}
head 2019-04-09-All-5x-CpGs.bedgraph-UTROverlap-percentMeth
```

### Calculate median methylation

```{bash}
#For each file that ends in *UTROverlap-percentMeth
#Sort numerically
#Calculate the median of the sixth column (percent methylation) for each file
#Save output in a new file
(for f in *UTROverlap-percentMeth
do
  sort -n ${f} \
  | awk ' { a[i++]=$6; } END { x=int((i+1)/2); if (x < (i+1)/2) print (a[x-1]+a[x])/2; else print a[x-1]; }'
done) > 2020-02-25-UTROverlap-percentMeth-Medians.txt
```

```{bash}
cat 2020-02-25-UTROverlap-percentMeth-Medians.txt
```

## Exons

```{bash}
head -n 1 ../2019-05-13-Generating-Genome-Feature-Tracks/C_virginica-3.0_Gnomon_exon_sorted_yrv.bed #Check track
```

### Run `intersectBed`

```{bash}
#For each file that ends in posOnly
#Use intersectBed to find where loci and promoters intersect
#wb: Print all lines in the second file
#a: file that ends in posOnly
#b: exon track
#Only keep specified columns
#Save output in a new file that has the same base name and ends in -promoterOverlap
for f in *posOnly
do
  /Users/Shared/bioinformatics/bedtools2/bin/intersectBed \
  -wb \
  -a ${f} \
  -b ../2019-05-13-Generating-Genome-Feature-Tracks/C_virginica-3.0_Gnomon_exon_sorted_yrv.bed \
    | awk '{print $1"\t"$2"\t"$3"\t"$5"\t"$6}' \
  > ${f}-exonOverlap
done
```

```{bash}
find *exonOverlap
```

```{bash}
head zr2096_1_s1_R1_val_1_bismark_bt2_pe.deduplicated.bismark.cov_5x.bedgraph.posOnly-exonOverlap #Confirm file is annotated
```

### Add back percent methylation information

```{bash}
#For each file that ends in exonOverlap
#Print the first three columns (chr, start, end) with dashes in between, then the rest of the columns in the file
#Save the file with the base name + .sorted
for f in *exonOverlap
do
  awk '{print $1"-"$2"-"$3"\t"$0}' ${f} \
  | sort -k1,1 \
  > ${f}.sorted
done
```

```{bash}
find *exonOverlap.sorted
```

```{bash}
head -n 1 zr2096_1_s1_R1_val_1_bismark_bt2_pe.deduplicated.bismark.cov_5x.bedgraph.posOnly-exonOverlap.sorted
```

```{bash}
#For each file that ends in bedgraph
#Join 2 files using the first column. The files and tab-delimited and the output should also be tab-delimited
#The first file ends with .sorted
#The second file ends with .posOnly-exonOverlap.sorted
#Add .annotated.percentMeth to the base name of the output file
for f in *bedgraph
do
  join -j 1 -t $'\t' \
  ${f}.sorted \
  ${f}.posOnly-exonOverlap.sorted \
  | awk '{print $2"\t"$3"\t"$4"\t"$9"\t"$10"\t"$5}' \
  > ${f}-exonOverlap-percentMeth
done
```

```{bash}
find *exonOverlap-percentMeth
```

```{bash}
head 2019-04-09-All-5x-CpGs.bedgraph-exonOverlap-percentMeth
```

### Calculate median methylation

```{bash}
#For each file that ends in *exonOverlap-percentMeth
#Sort numerically
#Calculate the median of the sixth column (percent methylation) for each file
#Save output in a new file
(for f in *exonOverlap-percentMeth
do
  sort -n ${f} \
  | awk ' { a[i++]=$6; } END { x=int((i+1)/2); if (x < (i+1)/2) print (a[x-1]+a[x])/2; else print a[x-1]; }'
done) > 2020-02-25-exonOverlap-percentMeth-Medians.txt
```

```{bash}
cat 2020-02-25-exonOverlap-percentMeth-Medians.txt
```

## Introns

```{bash}
head -n 1 ../2019-05-13-Generating-Genome-Feature-Tracks/C_virginica-3.0_Gnomon_intron_yrv.bed #Check track
```

### Run `intersectBed`

```{bash}
#For each file that ends in posOnly
#Use intersectBed to find where loci and promoters intersect
#wb: Print all lines in the second file
#a: file that ends in posOnly
#b: exon track
#Only keep specified columns
#Save output in a new file that has the same base name and ends in -intronOverlap
for f in *posOnly
do
  /Users/Shared/bioinformatics/bedtools2/bin/intersectBed \
  -wb \
  -a ${f} \
  -b ../2019-05-13-Generating-Genome-Feature-Tracks/C_virginica-3.0_Gnomon_intron_yrv.bed \
    | awk '{print $1"\t"$2"\t"$3"\t"$5"\t"$6}' \
  > ${f}-intronOverlap
done
```

```{bash}
find *intronOverlap
```

```{bash}
head zr2096_1_s1_R1_val_1_bismark_bt2_pe.deduplicated.bismark.cov_5x.bedgraph.posOnly-intronOverlap #Confirm file is annotated
```

### Add back percent methylation information

```{bash}
#For each file that ends in intronOverlap
#Print the first three columns (chr, start, end) with dashes in between, then the rest of the columns in the file
#Save the file with the base name + .sorted
for f in *intronOverlap
do
  awk '{print $1"-"$2"-"$3"\t"$0}' ${f} \
  | sort -k1,1 \
  > ${f}.sorted
done
```

```{bash}
find *intronOverlap.sorted
```

```{bash}
head -n 1 zr2096_1_s1_R1_val_1_bismark_bt2_pe.deduplicated.bismark.cov_5x.bedgraph.posOnly-intronOverlap.sorted
```

```{bash}
#For each file that ends in bedgraph
#Join 2 files using the first column. The files and tab-delimited and the output should also be tab-delimited
#The first file ends with .sorted
#The second file ends with .posOnly-intronOverlap.sorted
#Add .annotated.percentMeth to the base name of the output file
for f in *bedgraph
do
  join -j 1 -t $'\t' \
  ${f}.sorted \
  ${f}.posOnly-intronOverlap.sorted \
  | awk '{print $2"\t"$3"\t"$4"\t"$9"\t"$10"\t"$5}' \
  > ${f}-intronOverlap-percentMeth
done
```

```{bash}
find *intronOverlap-percentMeth
```

```{bash}
head 2019-04-09-All-5x-CpGs.bedgraph-intronOverlap-percentMeth
```

### Calculate median methylation

```{bash}
#For each file that ends in *intronOverlap-percentMeth
#Sort numerically
#Calculate the median of the sixth column (percent methylation) for each file
#Save output in a new file
(for f in *intronOverlap-percentMeth
do
  sort -n ${f} \
  | awk ' { a[i++]=$6; } END { x=int((i+1)/2); if (x < (i+1)/2) print (a[x-1]+a[x])/2; else print a[x-1]; }'
done) > 2020-02-25-intronOverlap-percentMeth-Medians.txt
```

```{bash}
cat 2020-02-25-intronOverlap-percentMeth-Medians.txt
```

## Transposable elements

```{bash}
head -n 7 ../2019-05-13-Generating-Genome-Feature-Tracks/C_virginica-3.0_TE-all.gff #Check track
```

### Run `intersectBed`

```{bash}
#For each file that ends in posOnly
#Use intersectBed to find where loci and promoters intersect
#wb: Print all lines in the second file
#a: file that ends in posOnly
#b: exon track
#Only keep specified columns
#Save output in a new file that has the same base name and ends in -intronOverlap
for f in *posOnly
do
  /Users/Shared/bioinformatics/bedtools2/bin/intersectBed \
  -wb \
  -a ${f} \
  -b ../2019-05-13-Generating-Genome-Feature-Tracks/C_virginica-3.0_TE-all.gff \
  | awk '{print $1"\t"$2"\t"$3"\t"$7"\t"$8}' \
  > ${f}-TEOverlap
done
```

```{bash}
find *TEOverlap
```

```{bash}
head zr2096_1_s1_R1_val_1_bismark_bt2_pe.deduplicated.bismark.cov_5x.bedgraph.posOnly-TEOverlap #Confirm file is annotated
```

### Add back percent methylation information

```{bash}
#For each file that ends in TEOverlap
#Print the first three columns (chr, start, end) with dashes in between, then the rest of the columns in the file
#Save the file with the base name + .sorted
for f in *TEOverlap
do
  awk '{print $1"-"$2"-"$3"\t"$0}' ${f} \
  | sort -k1,1 \
  > ${f}.sorted
done
```

```{bash}
find *TEOverlap.sorted
```

```{bash}
head -n 1 zr2096_1_s1_R1_val_1_bismark_bt2_pe.deduplicated.bismark.cov_5x.bedgraph.posOnly-TEOverlap.sorted
```

```{bash}
#For each file that ends in bedgraph
#Join 2 files using the first column. The files and tab-delimited and the output should also be tab-delimited
#The first file ends with .sorted
#The second file ends with .posOnly-TEOverlap.sorted
#Add .annotated.percentMeth to the base name of the output file
for f in *bedgraph
do
  join -j 1 -t $'\t' \
  ${f}.sorted \
  ${f}.posOnly-TEOverlap.sorted \
  | awk '{print $2"\t"$3"\t"$4"\t"$9"\t"$10"\t"$5}' \
  > ${f}-TEOverlap-percentMeth
done
```

```{bash}
find *TEOverlap-percentMeth
```

```{bash}
head 2019-04-09-All-5x-CpGs.bedgraph-TEOverlap-percentMeth
```

### Calculate median methylation

```{bash}
#For each file that ends in *TEOverlap-percentMeth
#Sort numerically
#Calculate the median of the sixth column (percent methylation) for each file
#Save output in a new file
(for f in *TEOverlap-percentMeth
do
  sort -n ${f} \
  | awk ' { a[i++]=$6; } END { x=int((i+1)/2); if (x < (i+1)/2) print (a[x-1]+a[x])/2; else print a[x-1]; }'
done) > 2020-02-25-TEOverlap-percentMeth-Medians.txt
```

```{bash}
cat 2020-02-25-TEOverlap-percentMeth-Medians.txt
```

## Intergenic regions

```{bash}
head -n 1 ../2019-05-13-Generating-Genome-Feature-Tracks/C_virginica-3.0_Gnomon_intergenic_yrv.bed #Check track
```

### Run `intersectBed`

```{bash}
#For each file that ends in posOnly
#Use intersectBed to find where loci and promoters intersect
#wb: Print all lines in the second file
#a: file that ends in posOnly
#b: exon track
#Only keep specified columns
#Save output in a new file that has the same base name and ends in -intergenicOverlap
for f in *posOnly
do
  /Users/Shared/bioinformatics/bedtools2/bin/intersectBed \
  -wb \
  -a ${f} \
  -b ../2019-05-13-Generating-Genome-Feature-Tracks/C_virginica-3.0_Gnomon_intergenic_yrv.bed \
  | awk '{print $1"\t"$2"\t"$3"\t"$5"\t"$6}' \
  > ${f}-intergenicOverlap
done
```

```{bash}
find *intergenicOverlap
```

```{bash}
head zr2096_1_s1_R1_val_1_bismark_bt2_pe.deduplicated.bismark.cov_5x.bedgraph.posOnly-intergenicOverlap #Confirm file is annotated
```

### Add back percent methylation information

```{bash}
#For each file that ends in intergenicOverlap
#Print the first three columns (chr, start, end) with dashes in between, then the rest of the columns in the file
#Save the file with the base name + .sorted
for f in *intergenicOverlap
do
  awk '{print $1"-"$2"-"$3"\t"$0}' ${f} \
  | sort -k1,1 \
  > ${f}.sorted
done
```

```{bash}
find *intergenicOverlap.sorted
```

```{bash}
head -n 1 zr2096_1_s1_R1_val_1_bismark_bt2_pe.deduplicated.bismark.cov_5x.bedgraph.posOnly-intergenicOverlap.sorted
```

```{bash}
#For each file that ends in bedgraph
#Join 2 files using the first column. The files and tab-delimited and the output should also be tab-delimited
#The first file ends with .sorted
#The second file ends with .posOnly-intergenicOverlap.sorted
#Add .annotated.percentMeth to the base name of the output file
for f in *bedgraph
do
  join -j 1 -t $'\t' \
  ${f}.sorted \
  ${f}.posOnly-intergenicOverlap.sorted \
  | awk '{print $2"\t"$3"\t"$4"\t"$9"\t"$10"\t"$5}' \
  > ${f}-intergenicOverlap-percentMeth
done
```

```{bash}
find *intergenicOverlap-percentMeth
```

```{bash}
head 2019-04-09-All-5x-CpGs.bedgraph-intergenicOverlap-percentMeth
```

### Calculate median methylation

```{bash}
#For each file that ends in *intergenicOverlap-percentMeth
#Sort numerically
#Calculate the median of the sixth column (percent methylation) for each file
#Save output in a new file
(for f in *intergenicOverlap-percentMeth
do
  sort -n ${f} \
  | awk ' { a[i++]=$6; } END { x=int((i+1)/2); if (x < (i+1)/2) print (a[x-1]+a[x])/2; else print a[x-1]; }'
done) > 2020-02-25-intergenicOverlap-percentMeth-Medians.txt
```

```{bash}
cat 2020-02-25-intergenicOverlap-percentMeth-Medians.txt
```

## Plot median methylation

### Reformat data

```{r}
#Import median methylation information

promoterMedMeth <- read.delim("2020-02-25-promoterOverlap-percentMeth-Medians.txt", header = FALSE)
UTRMedMeth <- read.delim("2020-02-25-UTROverlap-percentMeth-Medians.txt", header = FALSE)
exonMedMeth <- read.delim("2020-02-25-exonOverlap-percentMeth-Medians.txt", header = FALSE)
intronMedMeth <- read.delim("2020-02-25-intronOverlap-percentMeth-Medians.txt", header = FALSE)
TEMedMeth <- read.delim("2020-02-25-TEOverlap-percentMeth-Medians.txt", header = FALSE)
intergenicMedMeth <- read.delim("2020-02-25-intergenicOverlap-percentMeth-Medians.txt", header = FALSE)
```

```{r}
featureMedMeth <- cbind(promoterMedMeth, UTRMedMeth, exonMedMeth, intronMedMeth, TEMedMeth, intergenicMedMeth) #Create a dataframe with percent methylation information across features
colnames(featureMedMeth) <- c("promoter", "UTR", "exon", "intron", "TE", "intergenic") #Add column names
featureMedMeth[12,] <- featureMedMeth[2,] #Move sample 10 information to the end
featureMedMeth <- featureMedMeth[-2,] #Remove sample 10 information in the incorrect position
row.names(featureMedMeth) <- c("global", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10") #Add row names with sample distinctions
featureMedMeth <- as.data.frame(t(featureMedMeth)) #Transpose dataframe
tail(featureMedMeth) #Confirm formatting
```


### All median methylation for all features

```{r}
#pdf("2020-02-25-All-Median-Methylation-Across-Features.pdf", height = 8.5, width = 11)
par(mar = c(3,5,1,8)) #Change figure boundaries

barplot(t(featureMedMeth), 
        beside = TRUE, 
        axes = FALSE, 
        names.arg = c("Promoters", "UTR", "Exons", "Introns", "TE", "Intergenic"),
        col = c(plotColors[1],
                rep(plotColors[2], times = 5), 
                rep(plotColors[4], times = 5)),
        ylim = c(0,100)) #Create a grouped barplot (beside = TRUE) using a transposed version of the proportion data. Use axes = FALSE to remove the y-axis and names.arg to set labels on the x-axis.

axis(side = 2, at = seq(0, 100, by = 10), las = 2, col = "grey80", cex = 1.2)
mtext(side = 2, "Methylation (%)", line = 3, cex = 1.5) #Add y-axis label

par(fig = c(0, 1, 0, 1), oma = c(0, 0, 0, 0), mar = c(0, 0, 0, 0), new = TRUE) #Create new plot
plot(0, 0, type = "n", bty = "n", xaxt = "n", yaxt = "n") #Add new plot on top of current plot
legend("topright",
       xpd = TRUE,
       legend = c("Global", "Control", "Treatment"),
       pch = 22, 
        col = "black", 
        pt.bg = c(plotColors[1], plotColors[2], plotColors[4]),
       bty = "n",
       cex = 1.5) #Place a legend in the top right of the figure with no box. pch = 22 has a background and outline. col changes the outline, pt.bg changes the point fill

#dev.off()
```

```{r}
#pdf("2020-02-25-All-Median-Methylation-Across-Split-Features.pdf", height = 8.5, width = 11)
par(mar = c(3,4,0,0), oma = c(0, 2.5, 2, 0), mfrow = c(3,2)) #Change figure boundaries

barplot(t(featureMedMeth[1,]), 
        beside = TRUE, 
        axes = FALSE, 
        names.arg = rep("", times = 11),
        col = c(plotColors[1],
                rep(plotColors[2], times = 5), 
                rep(plotColors[5], times = 5)),
        ylim = c(0,120)) #Create a grouped barplot (beside = TRUE) using a transposed version of the promoter percent methylation data. Use axes = FALSE to remove the y-axis and names.arg to set labels on the x-axis.
axis(side = 2, at = seq(0, 100, by = 20), las = 2, col = "grey80", cex = 1.2) #Add y-axis
mtext(side = 3, " a. Putative promoters", line = -0.5, adj = 0) #Add figure label
mtext(side = 2, "Methylation (%)", line = 0, cex = 1.5, outer = TRUE) #Add y-axis outer label

barplot(t(featureMedMeth[2,]), 
        beside = TRUE, 
        axes = FALSE, 
        names.arg = rep("", times = 11),
        col = c(plotColors[1],
                rep(plotColors[2], times = 5), 
                rep(plotColors[5], times = 5)),
        ylim = c(0,120)) #Create a grouped barplot (beside = TRUE) using a transposed version of the UTR percent methylation data. Use axes = FALSE to remove the y-axis and names.arg to set labels on the x-axis.
axis(side = 2, at = seq(0, 100, by = 20), las = 2, col = "grey80", cex = 1.2) #Add y-axis
mtext(side = 3, " b. UTR", line = -0.5, adj = 0) #Add figure label

barplot(t(featureMedMeth[3,]), 
        beside = TRUE, 
        axes = FALSE, 
        names.arg = rep("", times = 11),
        col = c(plotColors[1],
                rep(plotColors[2], times = 5), 
                rep(plotColors[5], times = 5)),
        ylim = c(0,120)) #Create a grouped barplot (beside = TRUE) using a transposed version of the exon percent methylation data. Use axes = FALSE to remove the y-axis and names.arg to set labels on the x-axis.
axis(side = 2, at = seq(0, 100, by = 20), las = 2, col = "grey80", cex = 1.2) #Add y-axis
mtext(side = 3, " c. Exons", line = -0.5, adj = 0) #Add figure label

barplot(t(featureMedMeth[4,]), 
        beside = TRUE, 
        axes = FALSE, 
        names.arg = rep("", times = 11),
        col = c(plotColors[1],
                rep(plotColors[2], times = 5), 
                rep(plotColors[5], times = 5)),
        ylim = c(0,120)) #Create a grouped barplot (beside = TRUE) using a transposed version of the intron percent methylation data. Use axes = FALSE to remove the y-axis and names.arg to set labels on the x-axis.
axis(side = 2, at = seq(0, 100, by = 20), las = 2, col = "grey80", cex = 1.2) #Add y-axis
mtext(side = 3, " d. Introns", line = -0.5, adj = 0) #Add figure label

barplot(t(featureMedMeth[5,]), 
        beside = TRUE, 
        axes = FALSE, 
        names.arg = c("G", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10"),
        col = c(plotColors[1],
                rep(plotColors[2], times = 5), 
                rep(plotColors[5], times = 5)),
        ylim = c(0,120)) #Create a grouped barplot (beside = TRUE) using a transposed version of the TE percent methylation data. Use axes = FALSE to remove the y-axis and names.arg to set labels on the x-axis.
axis(side = 2, at = seq(0, 100, by = 20), las = 2, col = "grey80", cex = 1.2) #Add y-axis
mtext(side = 3, " e. Transposable elements", line = -0.5, adj = 0) #Add figure label

barplot(t(featureMedMeth[6,]), 
        beside = TRUE, 
        axes = FALSE, 
        names.arg = c("G", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10"),
        col = c(plotColors[1],
                rep(plotColors[2], times = 5), 
                rep(plotColors[5], times = 5)),
        ylim = c(0,120)) #Create a grouped barplot (beside = TRUE) using a transposed version of the intergenic percent methylation data. Use axes = FALSE to remove the y-axis and names.arg to set labels on the x-axis.
axis(side = 2, at = seq(0, 100, by = 20), las = 2, col = "grey80", cex = 1.2) #Add y-axis
mtext(side = 3, " f. Intergenic regions", line = -0.5, adj = 0) #Add figure label

#dev.off()
```

### Global methylation only

```{r}
#pdf("2020-02-25-Global-Median-Methylation-Across-Features.pdf", height = 8.5, width = 11)
par(mar = c(3,5,1,15)) #Change figure boundaries

barplot(featureMedMeth$global, 
        axes = FALSE, 
        names.arg = c("", "", "", "", "", ""),
        col = plotColors[1:6],
        ylim = c(0,100)) #Create a barplot using a transposed version of the proportion data. Use axes = FALSE to remove the y-axis and names.arg to set labels on the x-axis.

axis(side = 2, at = seq(0, 100, by = 10), las = 2, col = "grey80", cex = 1.2)
mtext(side = 2, "Methylation (%)", line = 3, cex = 1.5) #Add y-axis label

par(fig = c(0, 1, 0, 1), oma = c(0, 0, 0, 0), mar = c(0, 0, 0, 0), new = TRUE) #Create new plot
plot(0, 0, type = "n", bty = "n", xaxt = "n", yaxt = "n") #Add new plot on top of current plot
legend("topright",
       xpd = TRUE,
       legend = c("Putative Promoters", "UTRs", "Exons", "Introns", "Transposable Elements", "Intergenic"),
       pch = 22, 
        col = "black", 
        pt.bg = plotColors[1:6],
       bty = "n",
       cex = 1.5) #Place a legend in the top right of the figure with no box. pch = 22 has a background and outline. col changes the outline, pt.bg changes the point fill

#dev.off()
```

### Samples only

```{r}
#pdf("2020-02-25-Sample-Median-Methylation-Across-Features.pdf", height = 8.5, width = 11)
par(mar = c(3,5,1,8)) #Change figure boundaries

barplot(t(featureMedMeth[,2:11]), 
        beside = TRUE, 
        axes = FALSE, 
        names.arg = c("Promoters", "UTR", "Exons", "Introns", "TE", "Intergenic"),
        col = c(rep(plotColors[2], times = 5), 
                rep(plotColors[4], times = 5)),
        ylim = c(0,100)) #Create a grouped barplot (beside = TRUE) using a transposed version of the proportion data. Use axes = FALSE to remove the y-axis and names.arg to set labels on the x-axis.

axis(side = 2, at = seq(0, 100, by = 10), las = 2, col = "grey80", cex = 1.2)
mtext(side = 2, "Methylation (%)", line = 3, cex = 1.5) #Add y-axis label

par(fig = c(0, 1, 0, 1), oma = c(0, 0, 0, 0), mar = c(0, 0, 0, 0), new = TRUE) #Create new plot
plot(0, 0, type = "n", bty = "n", xaxt = "n", yaxt = "n") #Add new plot on top of current plot
legend("topright",
       xpd = TRUE,
       legend = c("Control", "Treatment"),
       pch = 22, 
        col = "black", 
        pt.bg = c(plotColors[2], plotColors[4]),
       bty = "n",
       cex = 1.5) #Place a legend in the top right of the figure with no box. pch = 22 has a background and outline. col changes the outline, pt.bg changes the point fill

#dev.off()
```

# Create multipanel plot

I'm going to create a multipanel plot with the frequency distribution of CpG types as well as median percent methylation across genome features.

```{r}
#pdf("2020-02-25-Methylation-Distribution-Median-Methylation-Across-Features-Multipanel.pdf", height = 8.5, width = 11)

plotMatrix <- matrix(c(1,1,0,2,3,
                       1,1,0,4,5,
                       1,1,0,6,7),
                     nrow = 3, ncol = 5, byrow = TRUE) #Create a matrix with plot location information. Fill matrix by row.
par(mar = c(2, 1, 0, 0), oma = c(4, 5, 2, 2)) #Specify inner and outer margins
layout(mat = plotMatrix, widths = c(2, 2, 0.7, 2, 2)) #Create a layout based on plotMatrix. The first and second plots (first column) should be wider.
layout.show(n = 7) #Confirm layout looks good

hist(x = cpgMethylation$methylation, axes = FALSE, xlab = "", ylab = "", main = "", col = plotColors[1], xaxs = "i", yaxs = "i") #Create base plot
axis(side = 1, col = "grey80", at = seq(from = 0, to = 100, by = 10), cex.axis = 1.2) #Add x-axis
mtext(side = 1, text = "Methylation (%)", line = 3.5, cex = 1.5) #Add x-axis label
axis(side = 2, col = "grey80", las = 2, labels = c("0", "2", "4", "6"), at = c(0, 2e+05, 4e+05, 6e+05), cex.axis = 1.2) #add y-axis
mtext(side = 2, text = "Frequency (x100,000)", line = 3, cex = 1.5) #Add y-axis label
mtext(side = 3, "a. CpG distribution", line = -1.5, adj = 1) #Add figure label

barplot(t(featureMedMeth[1,]), 
        beside = TRUE, 
        axes = FALSE, 
        names.arg = rep("", times = 11),
        col = c(plotColors[1],
                rep(plotColors[2], times = 5), 
                rep(plotColors[5], times = 5)),
        ylim = c(0,120)) #Create a grouped barplot (beside = TRUE) using a transposed version of the promoter percent methylation data. Use axes = FALSE to remove the y-axis and names.arg to set labels on the x-axis.
axis(side = 2, at = seq(0, 100, by = 50), las = 2, col = "grey80", cex = 1.2) #Add y-axis
mtext(side = 3, " b. Putative promoters", line = -1.5, adj = 0) #Add figure label

barplot(t(featureMedMeth[2,]), 
        beside = TRUE, 
        axes = FALSE, 
        names.arg = rep("", times = 11),
        col = c(plotColors[1],
                rep(plotColors[2], times = 5), 
                rep(plotColors[5], times = 5)),
        ylim = c(0,120)) #Create a grouped barplot (beside = TRUE) using a transposed version of the UTR percent methylation data. Use axes = FALSE to remove the y-axis and names.arg to set labels on the x-axis.
mtext(side = 3, " c. UTR", line = -1.5, adj = 0) #Add figure label

barplot(t(featureMedMeth[3,]), 
        beside = TRUE, 
        axes = FALSE, 
        names.arg = rep("", times = 11),
        col = c(plotColors[1],
                rep(plotColors[2], times = 5), 
                rep(plotColors[5], times = 5)),
        ylim = c(0,120)) #Create a grouped barplot (beside = TRUE) using a transposed version of the exon percent methylation data. Use axes = FALSE to remove the y-axis and names.arg to set labels on the x-axis.
axis(side = 2, at = seq(0, 100, by = 50), las = 2, col = "grey80", cex = 1.2) #Add y-axis
mtext(side = 3, " d. Exons", line = -1.5, adj = 0) #Add figure label
mtext(side = 2, "Methylation (%)", line = 3.5, cex = 1.5) #Add y-axis outer label

barplot(t(featureMedMeth[4,]), 
        beside = TRUE, 
        axes = FALSE, 
        names.arg = rep("", times = 11),
        col = c(plotColors[1],
                rep(plotColors[2], times = 5), 
                rep(plotColors[5], times = 5)),
        ylim = c(0,120)) #Create a grouped barplot (beside = TRUE) using a transposed version of the intron percent methylation data. Use axes = FALSE to remove the y-axis and names.arg to set labels on the x-axis.
mtext(side = 3, " e. Introns", line = -1.5, adj = 0) #Add figure label

barplot(t(featureMedMeth[5,]), 
        beside = TRUE, 
        axes = FALSE, 
        names.arg = rep("", times = 11),
        col = c(plotColors[1],
                rep(plotColors[2], times = 5), 
                rep(plotColors[5], times = 5)),
        ylim = c(0,120)) #Create a grouped barplot (beside = TRUE) using a transposed version of the TE percent methylation data. Use axes = FALSE to remove the y-axis and names.arg to set labels on the x-axis.
axis(side = 2, at = seq(0, 100, by = 50), las = 2, col = "grey80", cex = 1.2) #Add y-axis
mtext(side = 3, " f. Transposable elements", line = -1.5, adj = 0) #Add figure label

barplot(t(featureMedMeth[6,]), 
        beside = TRUE, 
        axes = FALSE, 
        names.arg = rep("", times = 11),
        col = c(plotColors[1],
                rep(plotColors[2], times = 5), 
                rep(plotColors[5], times = 5)),
        ylim = c(0,120)) #Create a grouped barplot (beside = TRUE) using a transposed version of the intergenic percent methylation data. Use axes = FALSE to remove the y-axis and names.arg to set labels on the x-axis.
mtext(side = 3, " g. Intergenic regions", line = -1.5, adj = 0) #Add figure label

par(fig = c(0, 1, 0, 1), oma = c(0, 0, 0, 0), mar = c(0, 0, 0, 0), new = TRUE) #Create new plot
plot(0, 0, type = "n", bty = "n", xaxt = "n", yaxt = "n") #Add new plot on top of current plot
legend(x = 0.16, y = -0.9,
       xpd = TRUE,
       horiz = TRUE,
       x.intersp = 0.5,
       legend = c("Global", "Control", "Treatment"),
       pch = 22, 
        col = "black", 
        pt.bg = plotColors[c(1,2,5)],
       bty = "n",
       cex = 2.2) #Place a horizontal legend at the bottom of genome feature plots with no box using manual x and y coordinates. pch = 22 has a background and outline. col changes the outline, pt.bg changes the point fill

#dev.off()
```


# Methylated 5x CpGs

## Import data

```{r}
allMethLoci <- read.csv("2019-04-09-All-5x-CpG-Loci-Methylated.csv", header = FALSE) #Import file with positions of all methylated loci with 5x coverage
colnames(allMethLoci) <- c("chromosome", "start", "end", "methylation") #Add column names
head(allMethLoci) #Confirm import
```

```{r}
methLociGeneOverlaps <- read.delim("2019-05-29-All5xCpGs-Genes.txt", header = FALSE, sep = "\t") #Import file with overlaps for methylated loci and genes
methLociGeneOverlaps <- methLociGeneOverlaps[,-c(4)] #Remove extra column
colnames(methLociGeneOverlaps) <- c("chromosome", "start", "end", "gene-start", "gene-end") #Add column names
head(methLociGeneOverlaps) #Confirm import
```

## Scaled methylated CpG distribution

Scaling each gene from 0% to 100%, I want to see where methylated CpG are located. This is useful to see if methylation is occuring in any consistent location for each gene. 

```{r}
methLociGeneOverlaps$geneLength <- methLociGeneOverlaps$`gene-end` - methLociGeneOverlaps$`gene-start` #Calculate gene length
methLociGeneOverlaps$absPosition <- methLociGeneOverlaps$start - methLociGeneOverlaps$`gene-start` #Calculate the absolute position of the methylated CpG in the gene
methLociGeneOverlaps$scaledPosition <- methLociGeneOverlaps$absPosition / methLociGeneOverlaps$geneLength #Calculate the scaled position of the methylated CpG in the gene
head(methLociGeneOverlaps) #Confirm calculations
```

```{r}
#pdf("2019-10-10-Scaled-Gene-Methylated-Loci.pdf", height = 8.5, width = 11)

par(mar = c(5, 6, 2, 2)) #Change figure dimensions
hist(methLociGeneOverlaps$scaledPosition, 
     breaks = 100,
     axes = FALSE,
     xlab = "", ylab = "", main = "",
     ylim = c(0,40000),
     col = "grey80",
     xaxs = "i", yaxs = "i") #Create base plot with no axes or labels. Include breaks at each percent.
axis(side = 1, at = seq(from = 0, to = 1, by = 0.1), col = "grey80", cex = 1.2) #Add x-axis
mtext(side = 1, "Scaled Position on Gene", line = 3, cex = 1.5) #Add x-axis label
axis(side = 2, at = seq(from = 0, to = 40000, by = 10000), col = "grey80", las = 2, cex = 1.2) #Add y-axis
mtext(side = 2, "Number of Methylated CpG", line = 4, cex = 1.5) #Add y-axis label

#dev.off()
```