---
title: "A2_edgeR_DS_Sp_RRBS"
author: "Sam Bogan"
date: "5/27/2021"
output: github_document
---

This is an R markdown document detailing edgeR analysis of differential splicing for the Sp_RRBS_ATAC repo, a documentation of analyses by Sam Bogan, Marie Strader, and Gretchen Hofmann that aimed to understand the gene regulatory effects of DNA methylation during transgenerational plasticity in the purple sea urchin *Strongylocentrotus purpuratus* and how these effects are regulated by other epigenomic and genomic states.

The code below reads in and filters an RNAseq count matrix, performs a PCA of each sample, and then fits a multifactorial glm from which pariwise contrasts are made to estimate differential splicing between treatment groups. Developmental treatment: larval S. purpuratus reared in experimental upwelling or non-upwelling conditions. Maternal treatment:  larval S. purpuratus spawned from mothers exposed to experimental upwelling or non-upwelling conditions.

This markdown finishes by outputing six dataframes: 4 containing lists of differential spliced genes (DSGs) and likelihood statistics or binary values for significantfor splicing corresponding to the maternal and developmental treatments and two dataframes containing differential exon use coefficients corresponding to both treatments.

Prior to this analysis, reads were mapped to the Spur_3.1.42 assembly and annotation using HiSat2 and counted using featureCounts in the subread package as detailed in Strader et al. 2020: https://www.frontiersin.org/articles/10.3389/fmars.2020.00205/full. Relevant scripts for alignment and read counting can be found at: https://github.com/mariestrader/S.purp_RRBS_RNAseq_2019.

```{r setup, include=FALSE}

knitr::opts_knit$set( root.dir = '~/Documents/GitHub/Sp_RRBS_ATAC/A_Differential_Exp_Splicing_Meth/' )

```

#Read in, filter, and multiQC data

```{r}

# Load required packages
library( edgeR )
library( tidyverse )
library( pheatmap )
library( ape )
library( vegan )
library( VennDiagram )
library( plyr )

# Read in exon read count matrix
exon_counts <- read.csv( "Input_data/exon_read_counts.csv" )

# Remove rows corresponding to duplicated exon ids
n_occur_ex <- data.frame( table( exon_counts$Geneid ) )

duplicated_exons <- n_occur_ex[ n_occur_ex$Freq > 1, ]

dup_exon_list <- duplicated_exons$Var1

exon_counts <- exon_counts[ ! exon_counts$Geneid %in% dup_exon_list, ]

# Make gene id matrix rowname
row.names( exon_counts ) <- exon_counts$Geneid

# Subset df to include necessary data only
exon_counts <- subset( exon_counts, select = -c( Geneid, 
                                                 Chr, 
                                                 Start, 
                                                 End, 
                                                 Strand, 
                                                 Length ) )

```

```{r}
# Replace sample IDs with simple names stating treatment groups
colnames( exon_counts ) <- c( "NN1", "NN2", "NN3",
                              "NU1", "NU2", "NU3",
                              "UN1", "UN2", "UN3",
                              "UU1", "UU2", "UU3" )

# Make experimental design tables for edgeR
Mat <- c( "N", "N", "N", "N", "N", "N",
            "U", "U", "U", "U", "U", "U" )

Dev <- c( "N", "N", "N", "U", "U", "U",
          "N", "N", "N", "U", "U", "U" )

targets_ec <- data.frame( Mat, Dev )

targets_ec$grouping <- paste( targets_ec$Mat,
                               targets_ec$Dev,
                               sep="_" )

# Round counts if necessary
data_input_ec <- round( exon_counts )

```

```{r}

# Apply read depth filter from whole genes
load( "Output_data/DGEList_keep.Rdata" ) 

# Look at DGEList_keep
keep_geneids <- gsub( "transcript:",
                      "",
                      gsub( "-tr.*",
                            "",
                            row.names( data_input_ec ) ) )

# Make geneid column in DGEList$counts for filtering based on transcript-level counts
data_input_ec$geneid <- gsub( "-tr.*",
                            "",
                            row.names( data_input_ec ) )

# Filter exon data based on transcript-level read counts
data_input_ec <- dplyr::filter( data_input_ec,
                                geneid %in% keep_geneids )

data_input_ec <- data_input_ec[, !( names( data_input_ec ) == "geneid" ) ]

# Convert counts back to matrix
data_input_ec <- as.matrix( data_input_ec )

# Make a DGEList
DGEList_ex <- DGEList( counts = data_input_ec, 
                       group = targets_ec$grouping, 
                       remove.zeros = T )

# Create library size normalization factors
DGEList_ex <- calcNormFactors( DGEList_ex )


# CPM conversion and log^2 transformation of read counts
DGEList_ex_log <- cpm( DGEList_ex, 
                       log = TRUE, 
                       prior.count = 2 )

# MDS of normalized exon read counts
MDS_ex <- plotMDS( DGEList_ex_log )

MDS_ex

# Run pcoa on gene read counts
pcoa_ec = pcoa( vegdist( t( DGEList_ex_log <- cpm( DGEList_ex,
                                                        log = TRUE,
                                                        prior.count = 2 ) ), 
                              method = "euclidean" ) / 1000 )

# Print sample scores across vectors
head( pcoa_ec$vectors )
```

```{r}

# Create model design that includes maternal and developmental effects and set intercept to 0
design_multi_ec <- model.matrix( ~0 + Mat + Dev )

# Add column names to model matrix
colnames( design_multi_ec ) <- c( "MatN", "MatU", "DevU" ) 

# Estmate mean dispersal for use in plotting common dispersal against tagwise dispersal
DGEList_ex <- estimateDisp( DGEList_ex, 
                       design_multi_ec, 
                       robust = TRUE )

# Plot tagwise dispersal and impose w/mean dispersal and trendline
plotBCV( DGEList_ex )

# Save a vector of exon ids
exons <- row.names( DGEList_ex_log )

# Fit a robust, multifactorial quasi-likelihood glm to normalized read counts
fit_ec <- glmQLFit( DGEList_ex, 
                     design_multi_ec, 
                     robust = TRUE ) 

# Plot shrinkage of Bayesian quasi-likelihood dispersion (not indicative of DSG analysis power as Bayesian shrinkage cannot be used)
plotQLDisp( fit_ec ) # High shrinkage

```

#Perform differential splicing analyses

```{r}
## Maternal DSGs

# Extract exon ids
exon_ids <- row.names( DGEList_ex$counts )

# Trim exon ids down to gene ids
gene_ids <- gsub( "-.*",
                  "", 
                  exon_ids )

# Design contrast between samples based on maternal effect
con_Mat <- makeContrasts( con_Mat = MatU - MatN ,
                          levels = design_multi_ec )

# Apply quasi-likelihood F test to incorporate Bayesian tagwise dispersion estimates as parameter for DEG analysis
Mat_dgelrt <- diffSpliceDGE( fit_ec, 
                               coef = ncol( targets_ec ), 
                               contrast = con_Mat, 
                               gene_ids, 
                               exonid = exon_ids, 
                               prior.count = 0.125, 
                               verbose = TRUE )

## Adjust p-values w/ FDR

# Simes p-value, corresponding to DSGs associated with many differential exon use events of small effect
Mat_dgelrt$Simes_FDR <- p.adjust( Mat_dgelrt$gene.Simes.p.value,
                                  "fdr" )

# Gene p-value, corresponding to DSGs associated with a few differential exon use events of large effect
Mat_dgelrt$gene_FDR <- p.adjust( Mat_dgelrt$gene.p.value,
                                 "fdr" )

# Gene p-value, corresponding to exons undergoing differential use relative to its gene
Mat_dgelrt$exon_FDR <- p.adjust( Mat_dgelrt$exon.p.value,
                                 "fdr" )

## Count significant DSGs and exons

# Simes FDR < 0.05
sum( Mat_dgelrt$Simes_FDR < 0.05 )

# Gene FDR < 0.05
sum( Mat_dgelrt$gene_FDR < 0.05 )

# Exon FDR < 0.05
sum( Mat_dgelrt$exon_FDR < 0.05 )

# Plot -log gene FDR values
plot( -log10(Mat_dgelrt$gene_FDR ) )

```

```{r}
#Dev DSGs

# Design contrast between samples based on maternal effect
con_Dev <- makeContrasts( con_Dev = DevU, 
                          levels = design_multi_ec )

# Apply quasi-likelihood F test to incorporate Bayesian tagwise dispersion estimates as parameter for DEG analysis
Dev_dgelrt <- diffSpliceDGE( fit_ec, 
                             coef = ncol( ldp_targets ), 
                             contrast = con_Dev, 
                             gene_ids, 
                             exonid = exon_ids, 
                             prior.count = 0.125,
                             verbose = TRUE )


## Adjust p-values w/ FDR

# Simes p-value, corresponding to DSGs associated with many differential exon use events of small effect
Dev_dgelrt$Simes_FDR <- p.adjust( Dev_dgelrt$gene.Simes.p.value,
                                  "fdr" )

# Gene p-value, corresponding to DSGs associated with a few differential exon use events of large effect
Dev_dgelrt$gene_FDR <- p.adjust( Dev_dgelrt$gene.p.value,
                                 "fdr" )

# Gene p-value, corresponding to exons undergoing differential use relative to its gene
Dev_dgelrt$exon_FDR <- p.adjust( Dev_dgelrt$exon.p.value,
                                 "fdr" )

## Count significant DSGs and exons

# Simes FDR < 0.05
sum( Dev_dgelrt$Simes_FDR < 0.05 )

# Gene FDR < 0.05
sum( Dev_dgelrt$gene_FDR < 0.05 )

# Exon FDR < 0.05
sum( Dev_dgelrt$exon_FDR < 0.05 )

# Plot -log gene FDR values
plot( -log10(Dev_dgelrt$gene_FDR ) )

```

#Determine which DSGs are due to spuriousness or alternative TSS

Genes exhibiting increased spurious expression or an increase in alternative TSS towar the 3' direction will be identified by diffspliceDGE as significant DSGs. Below, exon DEU coefficients are fit to linear models in order to identify genes with low, negative intercepts and positive slopes across exon number as these parameters are consistent with alt TSS and spuriousness. geneids of spurious and alt TSS will then be filtered from the DSG dataframes corresponding to maternal and developmental treatments.

```{r}

## Create 'exon info' dfs

# First, exon info df for maternal treatment
DSGs_Mat_exon_info <- data.frame( exon_id = Mat_dgelrt$exoncolname, 
                                  exon_coeff = Mat_dgelrt$coefficients, 
                                  exon_pval = Mat_dgelrt$exon.p.value,
                                  Treat = "Maternal" )

# Create exon number variable
DSGs_Mat_exon_info$exon_num <- gsub( ".*-E",
                                     "", 
                                     row.names( DSGs_Mat_exon_info) )
DSGs_Mat_exon_info$exon_num <- as.numeric( DSGs_Mat_exon_info$exon_num )

# Create geneid variable
DSGs_Mat_exon_info$geneid <- gsub( "-tr-E.*",
                                   "", 
                                   row.names( DSGs_Mat_exon_info ) )

# Then, exon info df for Devernal treatment
DSGs_Dev_exon_info <- data.frame( exon_id = Dev_dgelrt$exoncolname, 
                                  exon_coeff = Dev_dgelrt$coefficients, 
                                  exon_pval = Dev_dgelrt$exon.p.value,
                                  Treat = "Developmental" )

# Create exon number variable
DSGs_Dev_exon_info$exon_num <- gsub( ".*-E",
                                     "", 
                                     row.names( DSGs_Dev_exon_info) )
DSGs_Dev_exon_info$exon_num <- as.numeric( DSGs_Dev_exon_info$exon_num )

# Create geneid variable
DSGs_Dev_exon_info$geneid <- gsub( "-tr-E.*",
                                   "", 
                                   row.names( DSGs_Dev_exon_info ) )


# Create df of spuriously expressed genes
DSGs_Mat_exon_info$lm_id <- paste( DSGs_Mat_exon_info$geneid,
                                    DSGs_Mat_exon_info$Treat, 
                                   sep = "_" )

DSGs_Dev_exon_info$lm_id <- paste( DSGs_Dev_exon_info$geneid,
                                    DSGs_Dev_exon_info$Treat, 
                                   sep = "_" )

All_exon_info <- rbind( DSGs_Mat_exon_info, 
                        DSGs_Dev_exon_info )

# Linear models applied to identify DSGs to be filtered out for spurious criterion or alt TSS
spur_models <- dlply( All_exon_info, 
                      c( "lm_id", "geneid", "Treat" ), 
                      function( df ) lm( exon_coeff ~ exon_num, data = df ) )

# Apply coef to each model and return a data frame
spur_models_coef <- ldply( spur_models, 
                           coef )

# Extract p-values from spur models
spur_models_coef <- setNames( spur_models_coef, 
                              c("lm_id", "geneid", "Treat", "intercept", "slope" ) )

# Remove NAs from spuriousness/alt TSS coef df
spur_models_coef <- data.frame( spur_models_coef[ complete.cases( spur_models_coef ), ] )

# Df of model coeffs for DSGs for filtering that requires significant effect of exon number on DEU
non_p_coef_filt <- dplyr::filter( spur_models_coef, intercept < -0.25 &
                                    slope > 0 )

# Subset genes without spurious transcription or alt TSS
non_spur_models = subset( 
  spur_models_coef, !( geneid %in% non_p_coef_filt ) )

# Export df of model coeffs for spurious or alt TSS genes
write.csv( non_p_coef_filt, 
           "Output_data/non_p_coef_filt.csv" )

# Export df of model coeffs for genes without spurious transcription or alt TSS
write.csv( non_spur_models, 
           "Output_data/non_spur_models.csv" )

# Create a vector of genes that should be filtered out of maternal DSG dataset
mat_spur_transcripts <- dplyr::filter( non_p_coef_filt, 
                              Treat == "Maternal" )

# Create a vector of genes that should be filtered out of developmental DSG dataset
dev_spur_transcripts  <- dplyr::filter( non_p_coef_filt, 
                              Treat == "Developmental" )

```

```{r}

## Create lists of DSGs after filtering for with with changes to alt TSS or spuriousness

# Create %notin% syntax
`%notin%` <- negate(`%in%`)

# Mat DSGs df with geneids, p-values, and fdr values
DSGs_Mat_gene_info <- data.frame( gene_p_value = Mat_dgelrt$gene.p.value, 
                                  gene_FDR = Mat_dgelrt$gene_FDR )
DSGs_Mat_gene_info_sig <- DSGs_Mat_gene_info[ ( DSGs_Mat_gene_info$gene_FDR < 0.05 ), ]
DSGs_Mat_gene_info_sig$geneid <- row.names( DSGs_Mat_gene_info_sig )
mat_unfilt_DSG_count <- nrow( DSGs_Mat_gene_info_sig )

# Filter out genes with changes to alt TSS and spuriousness
DSGs_Mat_gene_info_sig <- dplyr::filter( DSGs_Mat_gene_info_sig, 
                                  geneid %notin% mat_spur_transcripts$geneid )

# Pull list of sig DSGs from filtered DSG df
DSGs_Mat_sig_list <- row.names( DSGs_Mat_gene_info_sig )

# What percent of maternal DSGs are left after filtering?
length( DSGs_Mat_sig_list ) / mat_unfilt_DSG_count

# Dev DSGs df with geneids, p-values, and fdr values
DSGs_Dev_gene_info <- data.frame( gene_p_value = Dev_dgelrt$gene.p.value, 
                                  gene_FDR = Dev_dgelrt$gene_FDR )
DSGs_Dev_gene_info_sig <- DSGs_Dev_gene_info[ ( DSGs_Dev_gene_info$gene_FDR < 0.05 ), ]
DSGs_Dev_gene_info_sig$geneid <- row.names( DSGs_Dev_gene_info_sig )
dev_unfilt_DSG_count <- nrow( DSGs_Dev_gene_info_sig )

# Filter out genes with changes to alt TSS and spuriousness
DSGs_Dev_gene_info_sig <- dplyr::filter( DSGs_Dev_gene_info_sig, 
                                  geneid %notin% dev_spur_transcripts$geneid )

# Pull list of sig DSGs from filtered DSG df
DSGs_Dev_sig_list <- row.names( DSGs_Dev_gene_info_sig )

# What percent of developmental DSGs are left after filtering?
1 - length( DSGs_Dev_sig_list ) / dev_unfilt_DSG_count

# Create df's with binary values associated w/ sig gene-level diff splicing for Fisher's exact
mat_DSG_Fishers_df <- data.frame( geneid = row.names( DSGs_Mat_gene_info),
                                  DSG = ifelse( DSGs_Mat_gene_info$gene_FDR < 0.05 &
                                                  DSGs_Mat_gene_info$gene_FDR %in% DSGs_Mat_sig_list,
                                                "1", "0" ) )

dev_DSG_Fishers_df <- data.frame( geneid = row.names( DSGs_Dev_gene_info),
                                  DSG = ifelse( DSGs_Dev_gene_info$gene_FDR < 0.05 &
                                                  DSGs_Dev_gene_info$gene_FDR %in% DSGs_Dev_sig_list,
                                                "1", "0" ) )

# Export DSG Fishers exact df's
write.csv( mat_DSG_Fishers_df, 
           "~/Documents/GitHub/Sp_RRBS_ATAC/C_Models_and_Figures/GO_MWU-master/mat_DSG_Fishers_df.csv", 
           row.names = FALSE )
write.csv( dev_DSG_Fishers_df, 
           "~/Documents/GitHub/Sp_RRBS_ATAC/C_Models_and_Figures/GO_MWU-master/dev_DSG_Fishers_df.csv", 
           row.names = FALSE )

# How many DEU exons responded to maternal and developmental environments after filtering spurious and alt TSS genes?
mat_sig_DEU_exons <- filter( data.frame( geneid = Mat_dgelrt$genes$GeneID,
                    exonid = Mat_dgelrt$genes$ExonID,
                    DEU_coeff = Mat_dgelrt$coefficients,
                    exon_fdr = p.adjust( Mat_dgelrt$exon.p.value, "fdr" ) ),
                    exon_fdr < 0.05 & geneid %notin% mat_spur_transcripts$geneid )

dev_sig_DEU_exons <- filter( data.frame( geneid = Dev_dgelrt$genes$GeneID,
                    exonid = Dev_dgelrt$genes$ExonID,
                    DEU_coeff = Dev_dgelrt$coefficients,
                    exon_fdr = p.adjust( Dev_dgelrt$exon.p.value, "fdr" ) ),
                    exon_fdr < 0.05 & geneid %notin% dev_spur_transcripts$geneid )

# Number of sig DEU exons
nrow( mat_sig_DEU_exons )
nrow( dev_sig_DEU_exons )

# Number of sig dropped exons
nrow( filter( mat_sig_DEU_exons, DEU_coeff < 0 ) )
nrow( filter( dev_sig_DEU_exons, DEU_coeff < 0 ) )

# Number of sig included exons
nrow( filter( mat_sig_DEU_exons, DEU_coeff > 0 ) )
nrow( filter( dev_sig_DEU_exons, DEU_coeff > 0 ) )

```

#Output DEU df and data for GO enrichment

```{r}

## Output maternal and developmental exon use coefficient data

# Filter out exons in DEU df from spurious or alt TSS genes
DSGs_Mat_exon_info <- filter( DSGs_Mat_exon_info,
                              geneid %notin% mat_spur_transcripts$geneid  )

DSGs_Dev_exon_info <- filter( DSGs_Dev_exon_info,
                              geneid %notin% dev_spur_transcripts$geneid  )

# Do the same for gene-level DEU data
DSGs_Mat_gene_info <- filter( DSGs_Mat_gene_info,
                              row.names( DSGs_Mat_gene_info ) %notin% mat_spur_transcripts$geneid  )

DSGs_Dev_gene_info <- filter( DSGs_Dev_gene_info,
                              row.names( DSGs_Dev_gene_info ) %notin% dev_spur_transcripts$geneid  )

# Export CSVs
write.csv( DSGs_Mat_exon_info,
           "Output_data/Mat_DEU_df.csv" )

write.csv( DSGs_Dev_exon_info,
           "Output_data/Dev_DEU_df.csv" )

## Output DSG -log pvals and geneids for gene ontology Mann-Whitney U test

# Create dfs
DSGs_Mat_pval <- data.frame( geneid = row.names( DSGs_Mat_gene_info ), 
                                    neg_log_pval = -log( DSGs_Mat_gene_info$gene_p_value ) )

DSGs_Dev_pval <- data.frame( geneid = row.names( DSGs_Dev_gene_info ), 
                                    neg_log_pval = -log( DSGs_Dev_gene_info$gene_p_value ) )

# Export CSVs
write.csv( DSGs_Mat_pval,
           "~/Documents/GitHub/Sp_RRBS_ATAC/C_Models_and_Figures/GO_MWU-master/DSGs_Mat_pval.csv",
           row.names = FALSE )

write.csv( DSGs_Dev_pval,
           "~/Documents/GitHub/Sp_RRBS_ATAC/C_Models_and_Figures/GO_MWU-master/DSGs_Dev_pval.csv",
           row.names = FALSE )

## Output exon-level DSG -log pvals and geneids for gene ontology Mann-Whitney U test

# Create dfs
DSGs_Mat_exon_pval <- filter(
  data.frame( geneid = DSGs_Mat_exon_info$geneid,
              neg_log_pval = -log( DSGs_Mat_exon_info$exon_pval ) ),
  geneid %notin% mat_spur_transcripts$geneid )

DSGs_Dev_exon_pval <- filter( 
  data.frame( geneid = DSGs_Dev_exon_info$geneid,
              neg_log_pval = -log( DSGs_Dev_exon_info$exon_pval ) ),
  geneid %notin% dev_spur_transcripts$geneid )

# Export CSVs
write.csv( DSGs_Mat_exon_pval,
           "~/Documents/GitHub/Sp_RRBS_ATAC/C_Models_and_Figures/GO_MWU-master/DSGs_Mat_exon_pval.csv",
           row.names = FALSE )

write.csv( DSGs_Dev_exon_pval,
           "~/Documents/GitHub/Sp_RRBS_ATAC/C_Models_and_Figures/GO_MWU-master/DSGs_Dev_exon_pval.csv",
           row.names = FALSE )

## Output exon-level DSG DEU coefficients and geneids for gene ontology Mann-Whitney U test

# Create dfs
DSGs_Mat_exon_coeff <- filter(
  data.frame( geneid = DSGs_Mat_exon_info$geneid,
              coeff = DSGs_Mat_exon_info$exon_coeff ),
  geneid %notin% mat_spur_transcripts$geneid )

DSGs_Dev_exon_coeff <- filter( 
  data.frame( geneid = DSGs_Dev_exon_info$geneid,
              coeff = DSGs_Dev_exon_info$exon_coeff ),
  geneid %notin% dev_spur_transcripts$geneid )

# Export CSVs
write.csv( DSGs_Mat_exon_coeff,
           "~/Documents/GitHub/Sp_RRBS_ATAC/C_Models_and_Figures/GO_MWU-master/DSGs_Mat_exon_coeff.csv",
           row.names = FALSE )

write.csv( DSGs_Dev_exon_coeff,
           "~/Documents/GitHub/Sp_RRBS_ATAC/C_Models_and_Figures/GO_MWU-master/DSGs_Dev_exon_coeff.csv",
           row.names = FALSE )

```

#Create Venn diagram of DEGs and genes with DEU

```{r}

# Read in maternal and developmental DE df's
mat_edgeR_GE_table_filt <- read.csv(
  "~/Documents/GitHub/Sp_RRBS_ATAC/A_Differential_Exp_Splicing_Meth/Output_data/mat_edgeR_GE_table_filt.csv" )

dev_edgeR_GE_table_filt <- read.csv(
  "~/Documents/GitHub/Sp_RRBS_ATAC/A_Differential_Exp_Splicing_Meth/Output_data/dev_edgeR_GE_table_filt.csv" )

# Extract geneids w/ sig DGE
mat_sig_degs <- filter( mat_edgeR_GE_table_filt,
                        p.adjust( PValue, "fdr" ) < 0.05 )

dev_sig_degs <- filter( dev_edgeR_GE_table_filt,
                        p.adjust( PValue, "fdr" ) < 0.05 )

# Correct geneIDs
mat_sig_degs$X <- gsub( "transcript:", "", gsub( "-tr", "", mat_sig_degs$X ) )
dev_sig_degs$X <- gsub( "transcript:", "", gsub( "-tr", "", dev_sig_degs$X ) )

# Create and export venn diagram of DSGs and DEGS
DSG_DEG_venn <- venn.diagram(
  x = list( mat_sig_degs$X, dev_sig_degs$X, DSGs_Mat_sig_list, DSGs_Dev_sig_list ),
  category.names = c("Mat DEGs" , "Dev DEGs" , "Mat DSGs", "Dev DSGs"),
  filename = "Output_data/DSG_DEG_venn.png",
  output = TRUE
)

```