---
title: "WGBS"
author: "HM Putnam"
date: "2/5/2022"
output: html_document
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```

# Load Libraries
```{r, warning=F, message=F}
#install.packages( c("data.table","plyr","reshape2","ggplot2","gridBase","devtools"))
#if (!requireNamespace("BiocManager", quietly=TRUE))
#    install.packages("BiocManager")
#BiocManager::install(c("GenomicRanges","rtracklayer","impute","Rsamtools"))
#install_github("BIMSBbioinfo/genomation",build_vignettes=FALSE)
#BiocManager::install("genomation")

# install the packages
#library('devtools')
library('methylKit')
#library('genomation')
library('GenomicRanges')
library('tidyverse')
library('plotrix')
library('viridis')
library('ggthemes')
library("pheatmap")
```

#obtain tab data files for egg and sperm methylation at X coverage
```{bash}

cd /home/shared/8TB_HDD_02/hputnam/ceabigr/RAnalysis/data

wget -r \
--no-check-certificate \
--quiet \
--no-directories --no-parent \
-P . \
-A *10x.tab \
https://gannet.fish.washington.edu/seashell/bu-mox/scrubbed/120321-cvBS/

#need protein annotation here 
#https://www.ncbi.nlm.nih.gov/genome/browse/#!/proteins/398/327504%7CCrassostrea%20virginica/

```

# load genome and annotation information
```{r, message=F}
#load genes gff (https://raw.githubusercontent.com/epigeneticstoocean/2018_L18-adult-methylation/main/genome-features/C_virginica-3.0_Gnomon_genes.bed)
Genes <- read.csv("/home/shared/8TB_HDD_02/hputnam/ceabigr/genome-features/C_virginica-3.0_Gnomon_genes.bed", header=F, sep="\t", na.string="NA", skip=0, stringsAsFactors = F) #read in data file
Genes <- Genes[,c(1:4)] #select desired columns only
colnames(Genes) <-c("scaffold", "start", "stop", "gene") #rename columns

#load gene functional annotation
Annot <- read.csv("/home/shared/8TB_HDD_02/hputnam/ceabigr/genome-features/proteins_398_327504.csv")
colnames(Annot)[6] <- "gene"
Annot$gene <- as.character(Annot$gene)
```

#EGGS
## Loading sample information
```{r, message=F}
#load sample information
sample.info <- read.csv("RAnalysis/data/adult-meta.csv", header=T, sep=",", na.string="NA", stringsAsFactors = F) #read in info

sample.info <- sample.info %>%
  filter(Sex=="F")
```

genrate a file for Xcov of all positions found in all samples (e.g., 10X)
```{bash}
multiIntersectBed -i \
/home/shared/8TB_HDD_02/hputnam/ceabigr/RAnalysis/data/*F_R1_val_1_10x.tab > RAnalysis/data/F_meth.10x.bed

```

```{bash}
cat RAnalysis/data/F_meth.10x.bed | awk '$4 ==16' > RAnalysis/data/F_filtered.AllSamps.10x.bed 

```


```{bash}
#Use intersectBed to find where loci and genes intersect, allowing loci to be mapped to annotated genes
#wb: Print all lines in the second file
#a: file that ends in posOnly
#b: annotated gene list
#Save output in a new file that has the same base name and ends in -Annotated.txt
for f in RAnalysis/data/*F_R1_val_1_10x.tab
do
  intersectBed \
  -wb \
  -a ${f} \
  -b RAnalysis/data/C_virginica-3.0_Gnomon_genes.bed \
  > ${f}_gene
done
```


```{bash}

#SKIP IF YOU DON'T WANT TO FILTER OUT LOCI THAT AREN'T PRESENT IN ALL SAMPLES

#intersect with file to subset only those positions found in all samples
for f in RAnalysis/data/*F_R1_val_1_10x.tab_gene
do
  intersectBed \
  -a ${f} \
  -b RAnalysis/data/F_filtered.AllSamps.10x.bed  \
  > ${f}_AllSamps_gene.bed
done

wc -l RAnalysis/data/*AllSamps_gene.bed

```

```{r}
meth.data <- list.files(path = "RAnalysis/data/", pattern = "F_R1_val_1_10x.tab_gene_AllSamps_gene.bed$", full.names=TRUE) %>%
  set_names(.) %>% 
  map_dfr(read.csv,.id="Sample.ID", header=FALSE, sep="\t", na.string="NA", stringsAsFactors = FALSE) %>% 
  dplyr::select(c(Sample.ID:V2, V4:V6,V10)) %>%
  group_by(Sample.ID)

colnames(meth.data) <- c("Sample.ID", "scaffold", "position","per.meth","meth","unmeth","gene")
meth.data$Sample.ID <- gsub("RAnalysis/data//","",meth.data$Sample.ID) #remove extra characters
meth.data$Sample.ID <- gsub("_R1_val_1_10x.tab_gene_AllSamps_gene.bed","",meth.data$Sample.ID) #remove extra characters
meth.data.x <- merge(meth.data, sample.info, by="Sample.ID")

#plotting
allgene <- meth.data.x %>%	
  group_by(gene,Sample.ID, Treatment) %>%
  summarise(mean = mean(per.meth),
            sem = std.error(per.meth))

# pdf("RAnalysis/output/Eggs_gene.methylation.pdf", width=10, height=20)
# ggplot(allgene, aes(y=gene, x=Treatment, fill=mean))+
#   geom_tile(color="white", size=0.01)+
#   scale_fill_viridis(direction = 1,name="Percent Methylation")+
#   facet_wrap(~Sample.ID, ncol=2)+
#   theme_tufte(base_family="Helvetica")
# dev.off()
  

```

Testing for DMGs
```{r}
# Binomial GLM to test for differentially methylated genes
sub_meth_table <- meth.data.x
sub_meth_table$grouped.gene <- paste0(sub_meth_table$Sample.ID, sub_meth_table$gene)
sub_meth_table$factors <- paste0(sub_meth_table$Treatment)


#filter for genes with >5 methylated positions
min.filt <- count(sub_meth_table, vars = c(grouped.gene))
newdata <- min.filt[ which(min.filt$n > 4), ]
sub_meth_table <- sub_meth_table[sub_meth_table$grouped.gene %in% newdata$vars,]

# create data frame to store results
results <- data.frame()
gs <- unique(sub_meth_table$gene)

#first subset the unique dataframes and second run the GLMs
for(i in 1:length(sub_meth_table$gene)){
    
    #subset the dataframe gene by gene
    sub_meth_table1 <- subset(sub_meth_table, gene ==gs[i])
    
    # fit glm position model
    fit <- glm(matrix(c(meth, unmeth), ncol=2) ~ Treatment, 
               data=sub_meth_table1, family=binomial)

    a <- anova(fit, test="LRT")
    
    # capture summary stats to data frame
    df <- data.frame(gene = sub_meth_table1[1,7],
                     pval.treatment1 = a$`Pr(>Chi)`[2],
                     pval.treatment2 = a$`Pr(>Chi)`[3],
                     #position = a$`Pr(>Chi)`[2], 
                     pval.treatment1_x_treatment2 = a$`Pr(>Chi)`[4],
                     #pval.treatment1_x_position = a$`Pr(>Chi)`[2],
                     #pval.treatment2_x_position = a$`Pr(>Chi)`[2],
                     #pval.treatment1_x_pval.treatment2_x_position = a$`Pr(>Chi)`[2],
                     stringsAsFactors = F)

    # bind rows of temporary data frame to the results data frame
    results <- rbind(results, df)
    
  }
# An error will be generated here for contrasts. 
#This potential for contrasts (interactions) is included in the case one wants to examine the role of position of CpG within a gene
#Error in `contrasts<-`(`*tmp*`, value = contr.funs[1 + isOF[nn]]) : contrasts can be applied only to factors with 2 or more levels
#Continuing the analysis from results line will generate the results in the absence of the contrast (interaction).

results[is.na(results)] <- 0

results$adj.pval.treatment1 <- p.adjust(results$pval.treatment1, method='BH')
results$adj.pval.treatment2 <- p.adjust(results$pval.treatment2, method='BH')
results$adj.pval.treatment1_x_treatment2 <- p.adjust(results$pval.treatment1_x_treatment2, method='BH')
#results$adj.pval.treatment1_x_position <- p.adjust(results$pval.treatment1_x_position, method='BH')
#results$adj.pval.treatment2_x_position <- p.adjust(results$pval.treatment2_x_position, method='BH')
#results$adj.pval.treatment1_x_pval.treatment2_x_position <- p.adjust(results$pval.treatment1_x_pval.treatment2_x_position, method='BH')

sig <-results
sig <- sig[,c(1,2,5)]

# Identifying DMG with significant interaction
sig.Treatment <- sig %>% filter(adj.pval.treatment1<0.05)
sig.Treatment <- sig.Treatment[order(sig.Treatment$adj.pval.treatment1),]
write.csv(sig.Treatment,"RAnalysis/output/DMG_05_Eggs.csv")

# Annotation of Secondary Exposure Interaction DMG
sig.Treatment.annot <- sig.Treatment
sig.Treatment.annot$gene <- gsub("gene-LOC","",sig.Treatment.annot$gene) #remove extra characters
sig.Treatment.annot <- left_join(sig.Treatment.annot, Annot, by="gene")
sig.Treatment.annot <- sig.Treatment.annot[!duplicated(sig.Treatment.annot$gene),]
write.csv(sig.Treatment.annot,"RAnalysis/output/DMG_05_Eggs_Annot.csv")


#Sanity check plotting of top DMGs with sig interaction
#Plotting reaction norm

sub_meth_table %>%	
  filter(gene== sig.Treatment$gene[2]) %>%
  group_by(gene,Treatment) %>%
  summarise(means = mean(per.meth),
            ses = std.error(per.meth)) %>%
ggplot( aes(x=Treatment, y=means, group=Treatment, colour=Treatment, shape=Treatment)) + #plot data
    geom_line(size=0.7, position=position_dodge(.1)) + #plot lines
    scale_colour_manual(values = c("gray", "black")) + #set line color
    geom_point(size=4, position=position_dodge(.1), colour="black") + #set point characteristics
    scale_shape_manual(values=c(1,18,16)) + #set shapes
    geom_errorbar(aes(ymin=means-ses, ymax=means+ses), #plot error bars
    width=0, position=position_dodge(.1), colour="black") + #set error bar characteristics
    xlab("Treatment") + #plot x axis label
    ylab("Percent Methylation") + #plot y axis label
    theme_bw()


```

#DEG Heatmap
```{r}

DMGs <- allgene[allgene$gene %in% sig.Treatment$gene,]
DMGs <- DMGs[,-5]

Data<- DMGs %>% spread(key="gene", value=mean)
Data.long <-Data
row.names(Data.long) <- Data$Sample.ID

Data.long  <- Data.long[,-c(1:2)]
Data.long <- data.matrix(Data.long, rownames.force=F)
Data.long <-t(Data.long)
Data.long <- as.matrix(Data.long)
colnames(Data.long) <- Data$Sample.ID

col_order <- c("16F", "19F", "39F", "44F", "52F", "53F", "54F", 
               "76F", "22F", "29F", "35F", "36F", "3F", "41F", "50F", "77F")
Data.long <- Data.long[, col_order]
Data.long

df<- as.data.frame(Data[,1:2])
df$Treatment <- as.factor(as.character(df$Treatment))
#df$Coral.ID <- as.factor(as.character(df$Coral.ID))
rownames(df) <- Data$Sample.ID 

ann_colors = list(
    Treatment = c(Control="blue", Exposed="red"))


pdf("RAnalysis/output/Eggs_DMG_treatment_heatmap.pdf")
pheatmap(Data.long, color=inferno(10), cluster_rows=T, show_rownames=FALSE,  show_colnames=TRUE, fontsize_row=8,
         scale="row", cluster_cols=F, annotation_col=df, annotation_colors = ann_colors, gaps_col = c(8), cutree_rows=7)
dev.off()

```


#SPERM
## Loading sample information
```{r, message=F}
#load sample information
sample.info <- read.csv("RAnalysis/data/adult-meta.csv", header=T, sep=",", na.string="NA", stringsAsFactors = F) #read in info

sample.info <- sample.info %>%
  filter(Sex=="M")
```

genrate a file for Xcov of all positions found in all samples (e.g., 10X)
```{bash}
multiIntersectBed -i \
RAnalysis/data/*M_R1_val_1_10x.tab > RAnalysis/data/M_meth.10x.bed

```

```{bash}
cat RAnalysis/data/M_meth.10x.bed | awk '$4 ==10' > RAnalysis/data/M_filtered.AllSamps.10x.bed 

```


```{bash}
#Use intersectBed to find where loci and genes intersect, allowing loci to be mapped to annotated genes
#wb: Print all lines in the second file
#a: file that ends in posOnly
#b: annotated gene list
#Save output in a new file that has the same base name and ends in -Annotated.txt
for f in RAnalysis/data/*M_R1_val_1_10x.tab
do
  intersectBed \
  -wb \
  -a ${f} \
  -b RAnalysis/data/C_virginica-3.0_Gnomon_genes.bed \
  > ${f}_gene
done
```


```{bash}
#intersect with file to subset only those positions found in all samples
for f in RAnalysis/data/*M_R1_val_1_10x.tab_gene
do
  intersectBed \
  -a ${f} \
  -b RAnalysis/data/M_filtered.AllSamps.10x.bed  \
  > ${f}_AllSamps_gene.bed
done

wc -l RAnalysis/data/*AllSamps_gene.bed

```

```{r}
meth.data <- list.files(path = "RAnalysis/data/", pattern = "M_R1_val_1_10x.tab_gene_AllSamps_gene.bed$", full.names=TRUE) %>%
  set_names(.) %>% 
  map_dfr(read.csv,.id="Sample.ID", header=FALSE, sep="\t", na.string="NA", stringsAsFactors = FALSE) %>% 
  dplyr::select(c(Sample.ID:V2, V4:V6,V10)) %>%
  group_by(Sample.ID)

colnames(meth.data) <- c("Sample.ID", "scaffold", "position","per.meth","meth","unmeth","gene")
meth.data$Sample.ID <- gsub("RAnalysis/data//","",meth.data$Sample.ID) #remove extra characters
meth.data$Sample.ID <- gsub("_R1_val_1_10x.tab_gene_AllSamps_gene.bed","",meth.data$Sample.ID) #remove extra characters
meth.data.x <- merge(meth.data, sample.info, by="Sample.ID")

#plotting
allgene <- meth.data.x %>%	
  group_by(gene,Sample.ID, Treatment) %>%
  summarise(mean = mean(per.meth),
            sem = std.error(per.meth))

# pdf("RAnalysis/output/Sperm_gene.methylation.pdf", width=10, height=20)
# ggplot(allgene, aes(y=gene, x=Treatment, fill=mean))+
#   geom_tile(color="white", size=0.01)+
#   scale_fill_viridis(direction = 1,name="Percent Methylation")+
#   facet_wrap(~Sample.ID, ncol=2)+
#   theme_tufte(base_family="Helvetica")
# dev.off()
  

```

Testing for DMGs
```{r}
# Binomial GLM to test for differentially methylated genes
sub_meth_table <- meth.data.x
sub_meth_table$grouped.gene <- paste0(sub_meth_table$Sample.ID, sub_meth_table$gene)
sub_meth_table$factors <- paste0(sub_meth_table$Treatment)


#filter for genes with >5 methylated positions
min.filt <- count(sub_meth_table, vars = c(grouped.gene))
newdata <- min.filt[ which(min.filt$n > 4), ]
sub_meth_table <- sub_meth_table[sub_meth_table$grouped.gene %in% newdata$vars,]

# create data frame to store results
results <- data.frame()
gs <- unique(sub_meth_table$gene)

#first subset the unique dataframes and second run the GLMs
for(i in 1:length(sub_meth_table$gene)){
    
    #subset the dataframe gene by gene
    sub_meth_table1 <- subset(sub_meth_table, gene ==gs[i])
    
    # fit glm position model
    fit <- glm(matrix(c(meth, unmeth), ncol=2) ~ Treatment, 
               data=sub_meth_table1, family=binomial)

    a <- anova(fit, test="LRT")
    
    # capture summary stats to data frame
    df <- data.frame(gene = sub_meth_table1[1,7],
                     pval.treatment1 = a$`Pr(>Chi)`[2],
                     pval.treatment2 = a$`Pr(>Chi)`[3],
                     #position = a$`Pr(>Chi)`[2], 
                     pval.treatment1_x_treatment2 = a$`Pr(>Chi)`[4],
                     #pval.treatment1_x_position = a$`Pr(>Chi)`[2],
                     #pval.treatment2_x_position = a$`Pr(>Chi)`[2],
                     #pval.treatment1_x_pval.treatment2_x_position = a$`Pr(>Chi)`[2],
                     stringsAsFactors = F)

    # bind rows of temporary data frame to the results data frame
    results <- rbind(results, df)
    
  }
# An error will be generated here for contrasts. 
#This potential for contrasts (interactions) is included in the case one wants to examine the role of position of CpG within a gene
#Error in `contrasts<-`(`*tmp*`, value = contr.funs[1 + isOF[nn]]) : contrasts can be applied only to factors with 2 or more levels
#Continuing the analysis from results line will generate the results in the absence of the contrast (interaction).

results[is.na(results)] <- 0

results$adj.pval.treatment1 <- p.adjust(results$pval.treatment1, method='BH')
results$adj.pval.treatment2 <- p.adjust(results$pval.treatment2, method='BH')
results$adj.pval.treatment1_x_treatment2 <- p.adjust(results$pval.treatment1_x_treatment2, method='BH')
#results$adj.pval.treatment1_x_position <- p.adjust(results$pval.treatment1_x_position, method='BH')
#results$adj.pval.treatment2_x_position <- p.adjust(results$pval.treatment2_x_position, method='BH')
#results$adj.pval.treatment1_x_pval.treatment2_x_position <- p.adjust(results$pval.treatment1_x_pval.treatment2_x_position, method='BH')

sig <-results
sig <- sig[,c(1,2,5)]

# Identifying DMG with significant interaction
sig.Treatment <- sig %>% filter(adj.pval.treatment1<0.05)
sig.Treatment <- sig.Treatment[order(sig.Treatment$adj.pval.treatment1),]
write.csv(sig.Treatment,"RAnalysis/output/DMG_05_Sperm.csv")

# Annotation of Secondary Exposure Interaction DMG
sig.Treatment.annot <- sig.Treatment
sig.Treatment.annot$gene <- gsub("gene-LOC","",sig.Treatment.annot$gene) #remove extra characters
sig.Treatment.annot <- left_join(sig.Treatment.annot, Annot, by="gene")
sig.Treatment.annot <- sig.Treatment.annot[!duplicated(sig.Treatment.annot$gene),]
write.csv(sig.Treatment.annot,"RAnalysis/output/DMG_05_Sperm_Annot.csv")

#Sanity check plotting of top DMGs with sig interaction
#Plotting reaction norm

sub_meth_table %>%	
  filter(gene== sig.Treatment$gene[1]) %>%
  group_by(gene,Treatment) %>%
  summarise(means = mean(per.meth),
            ses = std.error(per.meth)) %>%
ggplot( aes(x=Treatment, y=means, group=Treatment, colour=Treatment, shape=Treatment)) + #plot data
    geom_line(size=0.7, position=position_dodge(.1)) + #plot lines
    scale_colour_manual(values = c("gray", "black")) + #set line color
    geom_point(size=4, position=position_dodge(.1), colour="black") + #set point characteristics
    scale_shape_manual(values=c(1,18,16)) + #set shapes
    geom_errorbar(aes(ymin=means-ses, ymax=means+ses), #plot error bars
    width=0, position=position_dodge(.1), colour="black") + #set error bar characteristics
    xlab("Treatment") + #plot x axis label
    ylab("Percent Methylation") + #plot y axis label
    theme_bw()


```

#DEG Heatmap
```{r}

DMGs <- allgene[allgene$gene %in% sig.Treatment$gene,]
DMGs <- DMGs[,-5]

Data<- DMGs %>% spread(key="gene", value=mean)
Data.long <-Data
row.names(Data.long) <- Data$Sample.ID

Data.long  <- Data.long[,-c(1:2)]
Data.long <- data.matrix(Data.long, rownames.force=F)
Data.long <-t(Data.long)
Data.long <- as.matrix(Data.long)
colnames(Data.long) <- Data$Sample.ID

col_order <- c("16F", "19F", "39F", "44F", "52F", "53F", "54F", 
               "76F", "22F", "29F", "35F", "36F", "3F", "41F", "50F", "77F")
Data.long <- Data.long[, col_order]
Data.long

df<- as.data.frame(Data[,1:2])
df$Treatment <- as.factor(as.character(df$Treatment))
#df$Coral.ID <- as.factor(as.character(df$Coral.ID))
rownames(df) <- Data$Sample.ID 

ann_colors = list(
    Treatment = c(Control="blue", Exposed="red"))


pdf("RAnalysis/output/Eggs_DMG_treatment_heatmap.pdf")
pheatmap(Data.long, color=inferno(10), cluster_rows=T, show_rownames=FALSE,  show_colnames=TRUE, fontsize_row=8,
         scale="row", cluster_cols=F, annotation_col=df, annotation_colors = ann_colors, gaps_col = c(8), cutree_rows=7)
dev.off()

```

#ALL SAMPLES
genrate a file for Xcov of all positions found in all samples (e.g., 10X)
```{bash}

awk 'BEGIN{FS=OFS="\t"} {print $1,$2,$3}' /home/shared/8TB_HDD_02/hputnam/ceabigr/RAnalysis/data/M_filtered.AllSamps.10x.bed > /home/shared/8TB_HDD_02/hputnam/ceabigr/RAnalysis/data/M2_filtered.AllSamps.10x.bed

awk 'BEGIN{FS=OFS="\t"} {print $1,$2,$3}' /home/shared/8TB_HDD_02/hputnam/ceabigr/RAnalysis/data/F_filtered.AllSamps.10x.bed > /home/shared/8TB_HDD_02/hputnam/ceabigr/RAnalysis/data/F2_filtered.AllSamps.10x.bed

/home/shared/bedtools2/bin/intersectBed -a /home/shared/8TB_HDD_02/hputnam/ceabigr/RAnalysis/data/M2_filtered.AllSamps.10x.bed -b /home/shared/8TB_HDD_02/hputnam/ceabigr/RAnalysis/data/F2_filtered.AllSamps.10x.bed > /home/shared/8TB_HDD_02/hputnam/ceabigr/RAnalysis/data/meth.filtered.AllSamps.10x.bed

```

```{bash}
#Use intersectBed to find where loci and genes intersect, allowing loci to be mapped to annotated genes
#wb: Print all lines in the second file
#a: file that ends in posOnly
#b: annotated gene list
#Save output in a new file that has the same base name and ends in -Annotated.txt
for f in /home/shared/8TB_HDD_02/hputnam/ceabigr/RAnalysis/data/*_R1_val_1_10x.tab
do
  /home/shared/bedtools2/bin/intersectBed \
  -wb \
  -a ${f} \
  -b /home/shared/8TB_HDD_02/hputnam/ceabigr/genome-features/C_virginica-3.0_Gnomon_genes.bed \
  > ${f}_gene
done
```


```{bash}
#intersect with file to subset only those positions found in all samples
for f in /home/shared/8TB_HDD_02/hputnam/ceabigr/RAnalysis/data/*_R1_val_1_10x.tab_gene
do
  /home/shared/bedtools2/bin/intersectBed \
  -a ${f} \
  -b /home/shared/8TB_HDD_02/hputnam/ceabigr/RAnalysis/data/meth.filtered.AllSamps.10x.bed  \
  > ${f}_SexAllSamps_gene.bed
done

```

```{r}
#load sample information
sample.info <- read.csv("RAnalysis/data/adult-meta.csv", header=T, sep=",", na.string="NA", stringsAsFactors = F) #read in info

meth.data <- list.files(path = "/home/shared/8TB_HDD_02/hputnam/ceabigr/RAnalysis/data/", pattern = "_SexAllSamps_gene.bed$", full.names=TRUE) %>%
  set_names(.) %>% 
  map_dfr(read.csv,.id="Sample.ID", header=FALSE, sep="\t", na.string="NA", stringsAsFactors = FALSE) %>% 
  dplyr::select(c(Sample.ID:V2, V4:V6,V10)) %>%
  group_by(Sample.ID)

colnames(meth.data) <- c("Sample.ID", "scaffold", "position","per.meth","meth","unmeth","gene")
meth.data$Sample.ID <- gsub("/home/shared/8TB_HDD_02/hputnam/ceabigr/RAnalysis/data//","",meth.data$Sample.ID) #remove extra characters
meth.data$Sample.ID <- gsub("_R1_val_1_10x.tab_gene_SexAllSamps_gene.bed","",meth.data$Sample.ID) #remove extra characters
meth.data.x <- merge(meth.data, sample.info, by="Sample.ID")

#plotting
allgene <- meth.data.x %>%	
  group_by(gene,Sample.ID, Treatment,Sex) %>%
  summarise(mean = mean(per.meth),
            sem = std.error(per.meth))
write.csv(allgene, "/home/shared/8TB_HDD_02/hputnam/ceabigr/RAnalysis/data/meanmethgene_10x_allsampssex.csv")

# pdf("RAnalysis/output/Eggs_gene.methylation.pdf", width=10, height=20)
# ggplot(allgene, aes(y=gene, x=Treatment, fill=mean))+
#   geom_tile(color="white", size=0.01)+
#   scale_fill_viridis(direction = 1,name="Percent Methylation")+
#   facet_wrap(~Sample.ID, ncol=2)+
#   theme_tufte(base_family="Helvetica")
# dev.off()
  

```

Testing for DMGs
```{r}
# Binomial GLM to test for differentially methylated genes
sub_meth_table <- meth.data.x
sub_meth_table$grouped.gene <- paste0(sub_meth_table$Sample.ID, sub_meth_table$gene)
sub_meth_table$factors <- paste0(sub_meth_table$Treatment)


#filter for genes with >5 methylated positions
min.filt <- count(sub_meth_table, vars = c(grouped.gene))
newdata <- min.filt[ which(min.filt$n > 4), ]
sub_meth_table <- sub_meth_table[sub_meth_table$grouped.gene %in% newdata$vars,]

# create data frame to store results
results <- data.frame()
gs <- unique(sub_meth_table$gene)

#first subset the unique dataframes and second run the GLMs
for(i in 1:length(sub_meth_table$gene)){
    
    #subset the dataframe gene by gene
    sub_meth_table1 <- subset(sub_meth_table, gene ==gs[i])
    
    # fit glm position model
    fit <- glm(matrix(c(meth, unmeth), ncol=2) ~ Treatment, 
               data=sub_meth_table1, family=binomial)

    a <- anova(fit, test="LRT")
    
    # capture summary stats to data frame
    df <- data.frame(gene = sub_meth_table1[1,7],
                     pval.treatment1 = a$`Pr(>Chi)`[2],
                     pval.treatment2 = a$`Pr(>Chi)`[3],
                     #position = a$`Pr(>Chi)`[2], 
                     pval.treatment1_x_treatment2 = a$`Pr(>Chi)`[4],
                     #pval.treatment1_x_position = a$`Pr(>Chi)`[2],
                     #pval.treatment2_x_position = a$`Pr(>Chi)`[2],
                     #pval.treatment1_x_pval.treatment2_x_position = a$`Pr(>Chi)`[2],
                     stringsAsFactors = F)

    # bind rows of temporary data frame to the results data frame
    results <- rbind(results, df)
    
  }
# An error will be generated here for contrasts. 
#This potential for contrasts (interactions) is included in the case one wants to examine the role of position of CpG within a gene
#Error in `contrasts<-`(`*tmp*`, value = contr.funs[1 + isOF[nn]]) : contrasts can be applied only to factors with 2 or more levels
#Continuing the analysis from results line will generate the results in the absence of the contrast (interaction).

results[is.na(results)] <- 0

results$adj.pval.treatment1 <- p.adjust(results$pval.treatment1, method='BH')
results$adj.pval.treatment2 <- p.adjust(results$pval.treatment2, method='BH')
results$adj.pval.treatment1_x_treatment2 <- p.adjust(results$pval.treatment1_x_treatment2, method='BH')
#results$adj.pval.treatment1_x_position <- p.adjust(results$pval.treatment1_x_position, method='BH')
#results$adj.pval.treatment2_x_position <- p.adjust(results$pval.treatment2_x_position, method='BH')
#results$adj.pval.treatment1_x_pval.treatment2_x_position <- p.adjust(results$pval.treatment1_x_pval.treatment2_x_position, method='BH')

sig <-results
sig <- sig[,c(1,2,5)]

# Identifying DMG with significant interaction
sig.Treatment <- sig %>% filter(adj.pval.treatment1<0.05)
sig.Treatment <- sig.Treatment[order(sig.Treatment$adj.pval.treatment1),]
write.csv(sig.Treatment,"RAnalysis/output/DMG_05_Eggs.csv")

# Annotation of Secondary Exposure Interaction DMG
sig.Treatment.annot <- sig.Treatment
sig.Treatment.annot$gene <- gsub("gene-LOC","",sig.Treatment.annot$gene) #remove extra characters
sig.Treatment.annot <- left_join(sig.Treatment.annot, Annot, by="gene")
sig.Treatment.annot <- sig.Treatment.annot[!duplicated(sig.Treatment.annot$gene),]
write.csv(sig.Treatment.annot,"RAnalysis/output/DMG_05_Eggs_Annot.csv")


#Sanity check plotting of top DMGs with sig interaction
#Plotting reaction norm

sub_meth_table %>%	
  filter(gene== sig.Treatment$gene[2]) %>%
  group_by(gene,Treatment) %>%
  summarise(means = mean(per.meth),
            ses = std.error(per.meth)) %>%
ggplot( aes(x=Treatment, y=means, group=Treatment, colour=Treatment, shape=Treatment)) + #plot data
    geom_line(size=0.7, position=position_dodge(.1)) + #plot lines
    scale_colour_manual(values = c("gray", "black")) + #set line color
    geom_point(size=4, position=position_dodge(.1), colour="black") + #set point characteristics
    scale_shape_manual(values=c(1,18,16)) + #set shapes
    geom_errorbar(aes(ymin=means-ses, ymax=means+ses), #plot error bars
    width=0, position=position_dodge(.1), colour="black") + #set error bar characteristics
    xlab("Treatment") + #plot x axis label
    ylab("Percent Methylation") + #plot y axis label
    theme_bw()


```

#DEG Heatmap
```{r}

DMGs <- allgene[allgene$gene %in% sig.Treatment$gene,]
DMGs <- DMGs[,-5]

Data<- DMGs %>% spread(key="gene", value=mean)
Data.long <-Data
row.names(Data.long) <- Data$Sample.ID

Data.long  <- Data.long[,-c(1:2)]
Data.long <- data.matrix(Data.long, rownames.force=F)
Data.long <-t(Data.long)
Data.long <- as.matrix(Data.long)
colnames(Data.long) <- Data$Sample.ID

col_order <- c("16F", "19F", "39F", "44F", "52F", "53F", "54F", 
               "76F", "22F", "29F", "35F", "36F", "3F", "41F", "50F", "77F")
Data.long <- Data.long[, col_order]
Data.long

df<- as.data.frame(Data[,1:2])
df$Treatment <- as.factor(as.character(df$Treatment))
#df$Coral.ID <- as.factor(as.character(df$Coral.ID))
rownames(df) <- Data$Sample.ID 

ann_colors = list(
    Treatment = c(Control="blue", Exposed="red"))


pdf("RAnalysis/output/Eggs_DMG_treatment_heatmap.pdf")
pheatmap(Data.long, color=inferno(10), cluster_rows=T, show_rownames=FALSE,  show_colnames=TRUE, fontsize_row=8,
         scale="row", cluster_cols=F, annotation_col=df, annotation_colors = ann_colors, gaps_col = c(8), cutree_rows=7)
dev.off()

```