---
title: "B2_Integr_Sp_RRBS"
author: "Sam Bogan"
date: "6/27/2021"
output: github_document
---

This is an R markdown detailing the integration of datasets measuring baseline DNA methylation, chromatin accessibility, gene expression, and transcript variant occurence in the purple urchin *Strongylocentrotus purpuratus*, as well as differential expression, methylation, exon use (e.g. alternative splicing). This markdown is part of the Sp_RRBS_ATAC repo by Sam Bogan, Marie Strader, and Gretchen Hofmann. This project aimed to understand the gene regulatory effects of DNA methylation during transgenerational plasticity in an invertebrate, and how these effects are regulated by other epigenomic and genomic states.

The code below merges baseline and differential functional genomic data for genes and genomic features that were quantified and exported in section A of the Sp_RRBS_ATAC. The integrated datasets are then modeled and plotted in section C of Sp_RRBS_ATAC.

Prior to this analysis, reads were mapped to the Spur_3.1.42 assembly and annotation using HiSat2 and counted using featureCounts in the subread package as detailed in Strader et al. 2020: https://www.frontiersin.org/articles/10.3389/fmars.2020.00205/full. Relevant scripts for alignment and read counting can be found at: https://github.com/mariestrader/S.purp_RRBS_RNAseq_2019.

```{r setup, include=FALSE}

knitr::opts_knit$set( root.dir = '~/Documents/GitHub/Sp_RRBS_ATAC/B_Integr_ATAC_RRBS_RNA/' )

```

```{r}

# Load required packages
library( tidyverse )
library( plyr )
library( ape )
library( Rmisc )

```

#Read and wrangle chromatin accessibility data

```{r}

# Read in chromatin accesibility df containg genewise Tn5 insert densities per genomic feature
load( "Output_data/ATAC_summary_peakSummaryALL.Rdata" )

SPU_000241 <- filter( peaks_summary_all, spu_id == "SPU_000241" )

# Convert NA counts to 0's for each genomic feature type
peaks_summary_all$peak_density_Introns[ is.na( peaks_summary_all$peak_density_Introns ) ] = 0
peaks_summary_all$peak_density_Intron1[ is.na( peaks_summary_all$peak_density_Intron1 ) ] = 0
peaks_summary_all$peak_density_TSS[ is.na( peaks_summary_all$peak_density_TSS ) ] = 0
peaks_summary_all$peak_density_Exons[ is.na( peaks_summary_all$peak_density_Exons ) ] = 0

# Divide all counts by 3 to create average across three replicates
peaks_summary_all$peak_density_Introns <- ( peaks_summary_all$peak_density_Introns) / 3
peaks_summary_all$peak_density_Intron1 <- ( peaks_summary_all$peak_density_Intron1 ) / 3
peaks_summary_all$peak_density_TSS <- ( peaks_summary_all$peak_density_TSS ) / 3
peaks_summary_all$peak_density_Exons <- ( peaks_summary_all$peak_density_Exons ) / 3

# Convert 'spu_id' column to 'geneid'
names( peaks_summary_all )[ names( peaks_summary_all ) == "spu_id" ] <- "geneid"

### Create an index of exon lengths and intron lengths for normalization of Tn5 insert counts
## Exon = length of summed exon features
## Intron length = gene length - summed exon lengths

# Read in gff file as df
Spur_3.1_gff <- read.gff( "Input_data/Strongylocentrotus_purpuratus.Spur_3.1.42.gff3" )

# Calc gene lengths
Spur_3.1_gff_genes <- filter( Spur_3.1_gff, type == "gene" )
Spur_3.1_gff_genes$geneid <- gsub( ".*ID=gene:", "", Spur_3.1_gff_genes$attributes )
Spur_3.1_gff_genes$geneid <- gsub( ";.*", "", Spur_3.1_gff_genes$geneid )
Spur_3.1_gff_genes$Len <- Spur_3.1_gff_genes$end - Spur_3.1_gff_genes$start

## Calc exon lenghts
# First, output length of each exon
Spur_3.1_gff_exons <- filter( Spur_3.1_gff, type == "exon" )
Spur_3.1_gff_exons$geneid <- gsub( ".*Parent=transcript:", "", Spur_3.1_gff_exons$attributes )
Spur_3.1_gff_exons$geneid <- gsub( "-tr.*", "", Spur_3.1_gff_exons$geneid )
Spur_3.1_gff_exons$Len <- Spur_3.1_gff_exons$end - Spur_3.1_gff_exons$start

## Next, sum all exon lengths per gene
Spur_3.1_gff_exons_sum <- aggregate( Spur_3.1_gff_exons$Len, 
                                     by = list( Category = Spur_3.1_gff_exons$geneid ), 
                                     FUN = sum )

# Adjust column names after aggregating exon legnths per gene
names( Spur_3.1_gff_exons_sum ) = c( "geneid", "Len" )

# Create df of geneids, gene lengths, and summed exon lengths
Spur_3.1_gff_gene_exon_len <- merge( data.frame( gene_len = Spur_3.1_gff_genes$Len,
                                                 geneid = Spur_3.1_gff_genes$geneid ),
                                     data.frame( geneid = Spur_3.1_gff_exons_sum$geneid,
                                                 exon_len = Spur_3.1_gff_exons_sum$Len ), 
                                      by = "geneid" )

# Calculate intron length
Spur_3.1_gff_gene_exon_len$intron_len <- Spur_3.1_gff_gene_exon_len$gene_len - 
  Spur_3.1_gff_gene_exon_len$exon_len

# Merge gene/exon/intron length info with peaks_summary_all
peaks_summary_all_corr <- merge( peaks_summary_all, Spur_3.1_gff_gene_exon_len, by = "geneid" )

# Normalize TSS, intron, and exon accessibility by length
peaks_summary_all_corr$TSS_dens_norm <- ( peaks_summary_all_corr$peak_density_TSS / 1000 )

peaks_summary_all_corr$intron_dens_norm <- as.numeric( 
  ifelse( peaks_summary_all_corr$intron_len == "0", "0",
          peaks_summary_all_corr$peak_density_Introns /
            peaks_summary_all_corr$intron_len ) )

peaks_summary_all_corr$exon_dens_norm <- peaks_summary_all_corr$peak_density_Exons /
  peaks_summary_all_corr$exon_len

#Export peaks_summary_all_corr
write.csv( peaks_summary_all_corr,
           "Output_data/peaks_summary_all_corr.csv",
           row.names = FALSE )

```

#Combine chromatin accessibility and gene expression data

```{r}

## Maternal correlations between promoter DM and transcript DE
# Read in maternal differential expression data
mat_edgeR_GE <- read.csv( "~/Documents/GitHub/Sp_RRBS_ATAC/A_Differential_Exp_Splicing_Meth/Output_data/mat_edgeR_GE_table_filt.csv" )

# Change transcript ID to geneid
mat_edgeR_GE$X <- gsub( "transcript:", "", mat_edgeR_GE$X )
mat_edgeR_GE$X <- gsub( "-tr", "", mat_edgeR_GE$X )

# Subset maternal GE df down to geneid, logFC, and logCPM
keep_vars <- c( "X", "logFC", "logCPM" )
mat_edgeR_GE <- mat_edgeR_GE[ keep_vars ]

# Change 'X' column to 'geneid'
names( mat_edgeR_GE )[ names( mat_edgeR_GE ) == "X" ] <- "geneid"

# Merge maternal GE and chromatin access df's
ATAC_dens_by_GE <- merge( mat_edgeR_GE, peaks_summary_all_corr, by = "geneid", all.x = TRUE )
ATAC_dens_by_GE[ is.na( ATAC_dens_by_GE ) ] <- 0

# Output maternal GE by chromatin access df
write.csv( ATAC_dens_by_GE, "Output_data/ATAC_dens_by_GE.csv", row.names = FALSE )

```

#Combine chromatin accessibility and transcript variant/splicing data

```{r}
## Retrieve data on transcript variant number per gene from Spur_3.1 assembly and annotation
# Read in sp3_1 annotation for Spur_3.1
sp3_gff <- read.gff( "Input_data/sp3_1_GCF.gff3" )

# Filter annotation for transcripts only
sp3_transcripts <- filter( sp3_gff, type == "mRNA" )

# Create transcript id and geneid columns
sp3_transcripts$product <- gsub( ".*product=", "", sp3_transcripts$attributes )
sp3_transcripts$Locus <- gsub( ".*gene=", "", sp3_transcripts$attributes )
sp3_transcripts$Locus <- gsub( ";.*", "", sp3_transcripts$Locus )

# Measure number of transcript variants by counting rows per geneid
sp3_transcripts_freq <- data.frame( table( sp3_transcripts$Locus ) )

# Read in Echinobase geneinfo sheet
geneinfo <- read.csv( "Input_data/GenePageGeneralInfo_AllGenes.csv" )

# Filter for rows containing SPU_IDs -> SPU_clean
geneinfo_spu <- filter( geneinfo, grepl( "SPU_", SPU ) )
geneinfo_spu$ID <- gsub( ".*SPU", "", geneinfo_spu$SPU )
geneinfo_spu$ID <- substr( geneinfo_spu$ID, 1, 7 )

# Create geneid column w/ SPU_ID
geneinfo_spu$geneid <- paste( "SPU", geneinfo_spu$ID, sep = "" )

# Add SPU_ID/geneid to transcript variant frequency df
names( sp3_transcripts_freq )[ names( sp3_transcripts_freq ) == "Var1"] <- "Locus"
sp3_transcripts_freq <- merge( sp3_transcripts_freq,
                                    geneinfo_spu,
                                    by = "Locus" )

# Create df of geneid and # of transcript variants
spu_mRNA_var_df <- data.frame( geneid = sp3_transcripts_freq$geneid,
                               mRNA_var = sp3_transcripts_freq$Freq )

# Create binary variable for presence of transcript variants
spu_mRNA_var_df$splice <- ifelse( spu_mRNA_var_df$mRNA_var > 1, TRUE, FALSE )

# Merge transcript spu_mRNA_var_df with chromatin acessibility data
ATAC_dens_by_splicing <- merge( spu_mRNA_var_df, peaks_summary_all_corr, by = "geneid", all.x = TRUE )
ATAC_dens_by_splicing[ is.na( ATAC_dens_by_splicing ) ] <- 0

# Export chromatin accesibility by constitutive splicing data
write.csv( ATAC_dens_by_splicing, "Output_data/ATAC_dens_by_splicing.csv", row.names = FALSE )

```

#Combine diff methylation, diff expression, and chromatin accessibility data

```{r} 

# Change 'X' column to 'geneid' and other ammendments necessary before merging two edgeR tables
names( mat_edgeR_GE )[ names( mat_edgeR_GE ) == "X" ] <- "geneid"
names( mat_edgeR_GE )[ names( mat_edgeR_GE ) == "logFC" ] <- "GE_logFC"
names( mat_edgeR_GE )[ names( mat_edgeR_GE ) == "logCPM" ] <- "GE_logCPM"

# Read in maternal promoter diff meth data
dm_promoter_df <- 
  read.csv( "~/Documents/GitHub/Sp_RRBS_ATAC/A_Differential_Exp_Splicing_Meth/Output_data/mat_edgeR_1kb_promoter_DM_df.csv" )

# Subset maternal GE df down to geneid, logFC, and logCPM
dm_promoter_df <- dm_promoter_df[ keep_vars ]

# Change 'X' column to 'geneid'
names( dm_promoter_df )[ names( dm_promoter_df ) == "X" ] <- "geneid"
names( dm_promoter_df )[ names( dm_promoter_df ) == "logFC" ] <- "pr_meth_logFC"
names( dm_promoter_df )[ names( dm_promoter_df ) == "logCPM" ] <- "pr_meth_logCPM"

# Combine DM promoter and DE data (x = methylation; y = gene expression)
dm_promoter_by_DE <- merge( dm_promoter_df, mat_edgeR_GE, by = "geneid" ) # 1,114 rows

# Add chromatin accessibility data
dm_promoter_by_DE_ATAC <- merge( dm_promoter_by_DE, peaks_summary_all_corr, by = "geneid", all.x = TRUE )
dm_promoter_by_DE_ATAC[ is.na( dm_promoter_by_DE_ATAC ) ] <- 0

## Maternal correlations between intron DM and transcript DE

# Read in maternal intron diff meth data
dm_intron_df <- 
  read.csv( "~/Documents/GitHub/Sp_RRBS_ATAC/A_Differential_Exp_Splicing_Meth/Output_data/mat_edgeR_intron_gene_DM_df.csv" )

# Fix geneids
dm_intron_df$X <- gsub( "-I.*", "", dm_intron_df$X )

# Change 'X' column to 'geneid'
names( dm_intron_df )[ names( dm_intron_df ) == "X" ] <- "geneid"
names( dm_intron_df )[ names( dm_intron_df ) == "logFC" ] <- "int_meth_logFC"
names( dm_intron_df )[ names( dm_intron_df ) == "logCPM" ] <- "int_meth_logCPM"

# Combine DM promoter and DE data (x = methylation; y = gene expression)
dm_intron_by_DE <- merge( dm_intron_df, mat_edgeR_GE, by = "geneid" ) # 4,028 rows

# Add chromatin accessibility data
dm_intron_by_DE_ATAC <- merge( dm_intron_by_DE, peaks_summary_all_corr, by = "geneid", all.x = TRUE )
dm_intron_by_DE_ATAC[ is.na( dm_intron_by_DE_ATAC ) ] <- 0

# Read in maternal exon diff meth data
dm_exon_df <- 
  read.csv( "~/Documents/GitHub/Sp_RRBS_ATAC/A_Differential_Exp_Splicing_Meth/Output_data/mat_edgeR_exon_gene_DM_df.csv" )

# Fix geneids
dm_exon_df$X <- gsub( "-I.*", "", dm_exon_df$X )

# Change 'X' column to 'geneid'
names( dm_exon_df )[ names( dm_exon_df ) == "X" ] <- "geneid"
names( dm_exon_df )[ names( dm_exon_df ) == "logFC" ] <- "ex_meth_logFC"
names( dm_exon_df )[ names( dm_exon_df ) == "logCPM" ] <- "ex_meth_logCPM"

# Combine DM promoter and DE data (x = methylation; y = gene expression)
dm_exon_by_DE <- merge( dm_exon_df, mat_edgeR_GE, by = "geneid" ) # 3,229 rows

# Add chromatin accessibility data
dm_exon_by_DE_ATAC <- merge( dm_exon_by_DE, peaks_summary_all_corr, by = "geneid", all.x = TRUE )
dm_exon_by_DE_ATAC[ is.na( dm_exon_by_DE_ATAC ) ] <- 0

## Merge exon DM, intron DM, and maternal DE dfs
# Subset intron and exon DM df's down to geneid, logFC, and logCPM
keep_vars_dm <- c( 1, 2, 3 )
dm_intron_df <- dm_intron_df[ keep_vars_dm ]
dm_exon_df <- dm_exon_df[ keep_vars_dm ]

# Merge intron and exon DM df's
dm_intron_exon_by_DE <- merge( merge( dm_intron_df, dm_exon_df, by = "geneid" ),
                            mat_edgeR_GE, by = "geneid" )

# Add chromatin accessibility data
dm_intron_exon_by_DE_ATAC <- merge( dm_intron_exon_by_DE, peaks_summary_all_corr, by = "geneid", all.x = TRUE )
dm_intron_exon_by_DE_ATAC[ is.na( dm_intron_exon_by_DE_ATAC ) ] <- 0

# Exports CSV's of diff meth by GE and chromatin access for each genomic feature type or group
write.csv( dm_promoter_by_DE_ATAC, "Output_data/dm_promoter_by_DE_ATAC.csv", row.names = FALSE )
write.csv( dm_intron_by_DE_ATAC, "Output_data/dm_intron_by_DE_ATAC.csv", row.names = FALSE )
write.csv( dm_exon_by_DE_ATAC, "Output_data/dm_exon_by_DE_ATAC.csv", row.names = FALSE )
write.csv( dm_intron_exon_by_DE_ATAC, "Output_data/dm_intron_exon_by_DE_ATAC.csv", row.names = FALSE )

```

#Combine chromatin accessibility and methylation data

Does not include gene expression data

```{r}

# Merge promoter meth and chromatin access
# Add chromatin accessibility data
dm_promoter_by_ATAC <- merge( dm_promoter_df, peaks_summary_all_corr, by = "geneid", all.x = TRUE )
dm_promoter_by_ATAC[ is.na( dm_promoter_by_ATAC ) ] <- 0

# Now, introns
dm_intron_by_ATAC <- merge( dm_intron_df, peaks_summary_all_corr, by = "geneid", all.x = TRUE )
dm_intron_by_ATAC[ is.na( dm_intron_by_ATAC ) ] <- 0

# Now, exons
dm_exon_by_ATAC <- merge( dm_exon_df, peaks_summary_all_corr, by = "geneid", all.x = TRUE )
dm_exon_by_ATAC[ is.na( dm_exon_by_ATAC ) ] <- 0

# Lastly, introns and exons
dm_intron_exon_by_ATAC <- merge( merge( dm_intron_df, dm_exon_df, by = "geneid" ),
                            peaks_summary_all_corr, by = "geneid", all.x = TRUE )
dm_intron_exon_by_ATAC[ is.na( dm_intron_exon_by_ATAC ) ] <- 0

# Exports CSV's of diff meth by chromatin access, without GE, for each genomic feature type or group
write.csv( dm_promoter_by_ATAC, "Output_data/dm_promoter_by_ATAC.csv", row.names = FALSE )
write.csv( dm_intron_by_ATAC, "Output_data/dm_intron_by_ATAC.csv", row.names = FALSE )
write.csv( dm_exon_by_ATAC, "Output_data/dm_exon_by_ATAC.csv", row.names = FALSE )
write.csv( dm_intron_exon_by_DE_ATAC, "Output_data/dm_intron_exon_by_ATAC.csv", row.names = FALSE )

```

#Integrate baseline methylation with chromatin accessibility

```{r}

## Read in % meth per CpG df's from Section A
# Promoters
meth_promoter_perc_meth <- read.csv( 
  "~/Documents/GitHub/Sp_RRBS_ATAC/A_Differential_Exp_Splicing_Meth/Input_data/meth_promoter_perc_meth.csv" )
names( meth_promoter_perc_meth )[ names( meth_promoter_perc_meth ) == "V41" ] <- "geneid"

# Introns
meth_intron_perc_meth <- read.csv( 
  "~/Documents/GitHub/Sp_RRBS_ATAC/A_Differential_Exp_Splicing_Meth/Input_data/meth_intron_perc_meth.csv" )
meth_intron_perc_meth$geneid <- gsub( "transcript:", "", meth_intron_perc_meth$V41 )
meth_intron_perc_meth$geneid <- gsub( "-tr", "", meth_intron_perc_meth$geneid )

# Exons
meth_exon_perc_meth <- read.csv( 
  "~/Documents/GitHub/Sp_RRBS_ATAC/A_Differential_Exp_Splicing_Meth/Input_data/meth_exon_perc_meth.csv" )
meth_exon_perc_meth$geneid <- gsub( "-tr-E*.", "", meth_exon_perc_meth$V41 )
meth_exon_perc_meth$geneid <- gsub( "-E*.", "", meth_exon_perc_meth$geneid )

## Filter for genes with at least 5 CpGs across each genome feature df
# Promoters
tt_promoter <- table( meth_promoter_perc_meth$geneid )
df2_pr <- subset( meth_promoter_perc_meth, geneid %in% names( tt_promoter[ tt_promoter >= 5 ] ) )

# Introns
tt_intron <- table( meth_intron_perc_meth$geneid )
df2_int <- subset( meth_intron_perc_meth, geneid %in% names( tt_intron[ tt_intron >= 5 ] ) )

# Exons
tt_exon <- table( meth_exon_perc_meth$geneid )
df2_ex <- subset( meth_exon_perc_meth, geneid %in% names( tt_exon[ tt_exon >= 5 ] ) )

## Calculate percent methylation summed across given feature type of each gene
# Promoters
promoters_perc_meth <- summarySE( measurevar = "median",
                                  groupvars = "geneid",
                                  data = df2_pr )

# Introns
introns_perc_meth <- summarySE( measurevar = "median",
                                  groupvars = "geneid",
                                  data = df2_int )


# Introns
exons_perc_meth <- summarySE( measurevar = "median",
                                  groupvars = "geneid",
                                  data = df2_ex )

## Merge chromatin accessibility and baseline meth
# Promoters
base_meth_pr_by_ATAC <- merge( promoters_perc_meth, peaks_summary_all_corr, by = "geneid", all.x = TRUE )
base_meth_pr_by_ATAC[ is.na( base_meth_pr_by_ATAC ) ] <- 0

# Introns
base_meth_int_by_ATAC <- merge( introns_perc_meth, peaks_summary_all_corr, by = "geneid", all.x = TRUE )
base_meth_int_by_ATAC[ is.na( base_meth_int_by_ATAC ) ] <- 0

# Exons
base_meth_ex_by_ATAC <- merge( exons_perc_meth, peaks_summary_all_corr, by = "geneid", all.x = TRUE )
base_meth_ex_by_ATAC[ is.na( base_meth_ex_by_ATAC ) ] <- 0

# Exports CSV's of baseline meth by chromatin access, without GE, for each genomic feature type
write.csv( base_meth_pr_by_ATAC, "Output_data/base_meth_pr_by_ATAC.csv", row.names = FALSE )
write.csv( base_meth_int_by_ATAC, "Output_data/base_meth_int_by_ATAC.csv", row.names = FALSE )
write.csv( base_meth_ex_by_ATAC, "Output_data/base_meth_ex_by_ATAC.csv", row.names = FALSE )

```

#Integrate baseline methylation with gene expression and chromatin accessibility

```{r}
## Merge maternal GE df with baseline meth data
# Promoters
pr_meth_by_GE <- merge( promoters_perc_meth, mat_edgeR_GE, by = "geneid" )
pr_meth_by_GE_ATAC <- merge( pr_meth_by_GE, peaks_summary_all_corr, by = "geneid", all.x = TRUE )
pr_meth_by_GE_ATAC[ is.na( pr_meth_by_GE_ATAC ) ] <- 0

# Introns
int_meth_by_GE <- merge( introns_perc_meth, mat_edgeR_GE, by = "geneid" )
int_meth_by_GE_ATAC <- merge( int_meth_by_GE, peaks_summary_all_corr, by = "geneid", all.x = TRUE )
int_meth_by_GE_ATAC[ is.na( int_meth_by_GE_ATAC ) ] <- 0

# Exons
ex_meth_by_GE <- merge( exons_perc_meth, mat_edgeR_GE, by = "geneid" )
ex_meth_by_GE_ATAC <- merge( ex_meth_by_GE, peaks_summary_all_corr, by = "geneid", all.x = TRUE )
ex_meth_by_GE_ATAC[ is.na( ex_meth_by_GE_ATAC ) ] <- 0

# Introns and exons
int_ex_meth_by_GE <- merge( merge( data.frame( exon_meth = exons_perc_meth$median,
                                               geneid = exons_perc_meth$geneid ),
                                   data.frame( intron_meth = introns_perc_meth$median,
                                               geneid = introns_perc_meth$geneid ),
                                   by = "geneid" ),
                            mat_edgeR_GE, by = "geneid")
int_ex_meth_by_GE_ATAC <- merge( int_ex_meth_by_GE, peaks_summary_all_corr, by = "geneid", all.x = TRUE )
int_ex_meth_by_GE_ATAC[ is.na( int_ex_meth_by_GE_ATAC ) ] <- 0

# Exports CSV's of baseline meth by chromatin access WITH GE for each genomic feature type
write.csv( pr_meth_by_GE_ATAC, "Output_data/pr_meth_by_GE_ATAC.csv", row.names = FALSE )
write.csv( int_meth_by_GE_ATAC, "Output_data/int_meth_by_GE_ATAC.csv", row.names = FALSE )
write.csv( ex_meth_by_GE_ATAC, "Output_data/ex_meth_by_GE_ATAC.csv", row.names = FALSE )
write.csv( int_ex_meth_by_GE_ATAC, "Output_data/int_ex_meth_by_GE_ATAC.csv", row.names = FALSE )

```


#Integrate baseline methylation with constitutive splicing and chromatin accessibility

```{r}

## Merge baseline splicing df with baseline meth data
# Promoters
pr_meth_by_splicing <- merge( promoters_perc_meth, spu_mRNA_var_df, by = "geneid" ) #
pr_meth_by_splicing_ATAC <- merge( pr_meth_by_splicing, peaks_summary_all_corr, by = "geneid", all.x = TRUE )
pr_meth_by_splicing_ATAC[ is.na( pr_meth_by_splicing_ATAC ) ] <- 0

# Introns
int_meth_by_splicing <- merge( introns_perc_meth, spu_mRNA_var_df, by = "geneid" ) #
int_meth_by_splicing_ATAC <- merge( int_meth_by_splicing, peaks_summary_all_corr, by = "geneid", all.x = TRUE )
int_meth_by_splicing_ATAC[ is.na( int_meth_by_splicing_ATAC ) ] <- 0

# Exons
ex_meth_by_splicing <- merge( exons_perc_meth, spu_mRNA_var_df, by = "geneid" ) #
ex_meth_by_splicing_ATAC <- merge( ex_meth_by_splicing, peaks_summary_all_corr, by = "geneid", all.x = TRUE )
ex_meth_by_splicing_ATAC[ is.na( ex_meth_by_splicing_ATAC ) ] <- 0

# Introns and exons
int_ex_meth_by_splicing <- merge( merge( data.frame( exon_meth = exons_perc_meth$median,
                                               geneid = exons_perc_meth$geneid ),
                                         data.frame( intron_meth = introns_perc_meth$median,
                                               geneid = introns_perc_meth$geneid ),
                                         by = "geneid" ),
                                  spu_mRNA_var_df, by = "geneid" )
int_ex_meth_by_splicing_ATAC <- merge( int_ex_meth_by_splicing, peaks_summary_all_corr, by = "geneid", all.x = TRUE )
int_ex_meth_by_splicing_ATAC[ is.na( int_ex_meth_by_splicing_ATAC ) ] <- 0

# Exports CSV's of baseline meth by chromatin access WITH GE for each genomic feature type
write.csv( pr_meth_by_splicing_ATAC, "Output_data/pr_meth_by_splicing_ATAC.csv", row.names = FALSE )
write.csv( int_meth_by_splicing_ATAC, "Output_data/int_meth_by_splicing_ATAC.csv", row.names = FALSE )
write.csv( ex_meth_by_splicing_ATAC, "Output_data/ex_meth_by_splicing_ATAC.csv", row.names = FALSE )
write.csv( int_ex_meth_by_splicing_ATAC, "Output_data/int_ex_meth_by_splicing_ATAC.csv", row.names = FALSE )


```

#Integrate differential exon use and exon-level diff meth

```{r}

## Merge exon-level diff meth, diff exon use, and chromatin accessibility data
# Read in data
Mat_DEU_df <- read.csv( "~/Documents/GitHub/Sp_RRBS_ATAC/A_Differential_Exp_Splicing_Meth/Output_data/Mat_DEU_df.csv" )
mat_edgeR_exon_DM_df <- read.csv( "~/Documents/GitHub/Sp_RRBS_ATAC/A_Differential_Exp_Splicing_Meth/Output_data/mat_edgeR_exon_DM_df.csv" )

# Create exonID variable
names( mat_edgeR_exon_DM_df )[ names( mat_edgeR_exon_DM_df ) == "X" ] <- "exonid"
names( Mat_DEU_df )[ names( Mat_DEU_df ) == "X" ] <- "exonid"

# Create geneid variable where necessary
mat_edgeR_exon_DM_df$geneid <- gsub( "-tr-E.*", "", mat_edgeR_exon_DM_df$exonid )
mat_edgeR_exon_DM_df$geneid <- gsub( "-E.*", "", mat_edgeR_exon_DM_df$geneid )

# Merge data
exon_DM_DEU <- merge( data.frame( exonid = Mat_DEU_df$exonid,
                                  geneid = Mat_DEU_df$geneid,
                                  exon_coeff = Mat_DEU_df$exon_coeff,
                                  exon_num = Mat_DEU_df$exon_num ), 
                      data.frame( exonid = mat_edgeR_exon_DM_df$exonid,
                                  exon_DM_logFC = mat_edgeR_exon_DM_df$logFC ), 
                      by = "exonid" )
exon_DM_DEU_ATAC <- merge( exon_DM_DEU, peaks_summary_all_corr, by = "geneid", all.x = TRUE )
exon_DM_DEU_ATAC[ is.na( exon_DM_DEU_ATAC ) ] <- 0

# Output exon-level DM, DEU, and ATAC data
write.csv( exon_DM_DEU_ATAC, "Output_data/exon_DM_DEU_ATAC.csv", col.names = FALSE )

```

#Create exon-level chromatin accessibility df and integrate w/ methylation data

```{r}

# Read in within-exon Tn5 inserts
load( "~/Documents/GitHub/Sp_RRBS_ATAC/B_Integr_ATAC_RRBS_RNA/Output_data/peakExon_all.Rdata" )

# Read in annotation information for SPU_3.1
gene_info_table_header <- read.table( "~/Documents/GitHub/Sp_RRBS_ATAC/B_Integr_ATAC_RRBS_RNA/Input_data/gene_info_table_header.txt",
                                      header = T, sep = "\t", quote = NULL )

# Convert geneID to factor
peakExon_all$geneId <- as.factor ( peakExon_all$geneId )

# Create exonid variable
peakExon_all$exon_num <- gsub( ".*exon ",
                               "", 
                               peakExon_all$annotation )

peakExon_all$exon_num <- gsub( "of .*",
                               "",
                               peakExon_all$exon_num )

peakExon_all$exonid <- paste( peakExon_all$geneId,
                                peakExon_all$exon_num,
                                sep = "_" )

# Count Tn5 tags per exon
peakCovExon_all <- count( peakExon_all$exonid )

# Adjust column names of exon counts
names( peakCovExon_all ) = c( "exon_common_name", "peak_density_ind_exons" )

# Create common name variable
peakCovExon_all$common_name <- gsub( "_.*", "", peakCovExon_all$exon_common_name )

# Merge peakCovExon_all w/ SPU_IDs
annot <- as.data.frame( gene_info_table_header[ c( 2, 7) ], )
annot <- as.data.frame( sapply( annot, toupper ) )

peakCovExon_all <- merge( peakCovExon_all, annot, by = "common_name", all.x = TRUE )

# Filter peakCovExon_all for genes w/ spu_id's to be merged with methylation and GE data
peakCovExon_all <- filter( peakCovExon_all, spu_id != "NA" )

## Merge peakExon_all w/ exon length info
peakCovExon_all$exonid <- paste( peakCovExon_all$spu_id,
                                 gsub( ".*_", "-tr-E", peakCovExon_all$exon_common_name ),
                                 sep = "" )

# Create a dataframe to w/ exonid and exon length to be combined w/ accessibility data
exon_df_for_merge <- data.frame( 
  exonid = as.character(
    gsub( ".*Name=", "",
        gsub( ";constitutive=.*", "", Spur_3.1_gff_exons$attributes ) ) ),
  exon_length = ( Spur_3.1_gff_exons$end - Spur_3.1_gff_exons$start ) )

# Clean out spaces in exonid's
exon_df_for_merge$exonid <- as.factor( gsub( " ",
                                                "",
                                                exon_df_for_merge$exonid ) )

peakCovExon_all$exonid <- as.factor( gsub( " ",
                                                "",
                                                peakCovExon_all$exonid ) )

peakCovExon_all <- base::merge( exon_df_for_merge,
  peakCovExon_all,
  by.x = "exonid",
  by.y = "exonid",
  all.x = TRUE )

# Add in spu_id's where missing
peakCovExon_all$spu_id <- as.factor( gsub( "-tr-E.*", "", peakCovExon_all$exonid ) )

# Normalize exon access by length
peakCovExon_all$peak_dens_norm_ind_exon <- peakCovExon_all$peak_density_ind_exons / peakCovExon_all$exon_length

# Replace NAs w/ 0 in peak_density_ind_exons and peak_dens_norm_ind_exon
peakCovExon_all$peak_density_ind_exons[ is.na( peakCovExon_all$peak_density_ind_exons ) ] <- 0
peakCovExon_all$peak_dens_norm_ind_exon[ is.na( peakCovExon_all$peak_dens_norm_ind_exon ) ] <- 0

# Remove common_name columns
peakCovExon_all <- peakCovExon_all[ -c( 3, 4 ) ]

# Read in exon-level methylation data
meth_exon_perc_meth <- read.csv( "~/Documents/GitHub/Sp_RRBS_ATAC/A_Differential_Exp_Splicing_Meth/Input_data/meth_exon_perc_meth.csv" )

# Change variable V41 to exonid
names( meth_exon_perc_meth )[ names( meth_exon_perc_meth ) == "V41" ] <- "exonid"

## Filter for exons with at least 5 CpGs across in dataset
# Promoters
tt_ex_lvl <- table( meth_exon_perc_meth$exonid )
df2_ex_lvl <- subset( meth_exon_perc_meth, exonid %in% names( tt_ex_lvl[ tt_ex_lvl >= 3 ] ) )

# Calculate percent methylation summed across singe exons
ex_lvl_perc_meth <- summarySE( measurevar = "median",
                                  groupvars = "exonid",
                                  data = df2_ex_lvl )

# Merge exon-level meth data with accessibility data
exon_lvl_meth_ATAC <- merge( ex_lvl_perc_meth,
                             peakCovExon_all,
                             by = "exonid" )

# Output exom-level meth x accessibility data
write.csv( exon_lvl_meth_ATAC, "Output_data/exon_lvl_meth_ATAC.csv" )

# Plot exon access across methylation
ggplot( data = exon_lvl_meth_ATAC,
        aes( y = median, x = peak_dens_norm_ind_exon ) ) +
  geom_point( alpha = 0.25 ) +
  geom_smooth( method = "glm", se = TRUE )

```

#Create genebody-level diff meth by diff expression df w/ ATAC data

```{r}
# Read in genebody-level maternal diff meth data
mat_edgeR_genebody_diff_meth_df <- read.csv( 
  "~/Documents/GitHub/Sp_RRBS_ATAC/A_Differential_Exp_Splicing_Meth/Output_data/mat_edgeR_genebody_diff_meth_df.csv" )

# Replace 'X' variable in mat_edgeR_genebody_diff_meth_df w/ 'geneid'
names( mat_edgeR_genebody_diff_meth_df )[ names( mat_edgeR_genebody_diff_meth_df ) == c( "X", 
                                                                                         "logFC", 
                                                                                         "logCPM" ) ] <- c( "geneid",
                                                                                                            "gb_DM_logFC",
                                                                                                            "gb_DM_logCPM" )

# Merge genebody DM df w/ ATAC_dens_by_GE: results in df w/ 5,416 rows
gb_DM_by_GE_ATAC_df <- merge( mat_edgeR_genebody_diff_meth_df, # 8,328 rows before merge
                              ATAC_dens_by_GE, # 16,303 rows
                              by = "geneid" )

# Output .csv of gb_DM_by_GE_ATAC_df
write.csv( gb_DM_by_GE_ATAC_df, "Output_data/gb_DM_by_GE_ATAC_df.csv", row.names = FALSE )

```