--- title: "B2_Integr_Sp_RRBS" author: "Sam Bogan" date: "6/27/2021" output: github_document --- This is an R markdown detailing the integration of datasets measuring baseline DNA methylation, chromatin accessibility, gene expression, and transcript variant occurence in the purple urchin *Strongylocentrotus purpuratus*, as well as differential expression, methylation, exon use (e.g. alternative splicing). This markdown is part of the Sp_RRBS_ATAC repo by Sam Bogan, Marie Strader, and Gretchen Hofmann. This project aimed to understand the gene regulatory effects of DNA methylation during transgenerational plasticity in an invertebrate, and how these effects are regulated by other epigenomic and genomic states. The code below merges baseline and differential functional genomic data for genes and genomic features that were quantified and exported in section A of the Sp_RRBS_ATAC. The integrated datasets are then modeled and plotted in section C of Sp_RRBS_ATAC. Prior to this analysis, reads were mapped to the Spur_3.1.42 assembly and annotation using HiSat2 and counted using featureCounts in the subread package as detailed in Strader et al. 2020: https://www.frontiersin.org/articles/10.3389/fmars.2020.00205/full. Relevant scripts for alignment and read counting can be found at: https://github.com/mariestrader/S.purp_RRBS_RNAseq_2019. ```{r setup, include=FALSE} knitr::opts_knit$set( root.dir = '~/Documents/GitHub/Sp_RRBS_ATAC/B_Integr_ATAC_RRBS_RNA/' ) ``` ```{r} # Load required packages library( tidyverse ) library( plyr ) library( ape ) library( Rmisc ) ``` #Read and wrangle chromatin accessibility data ```{r} # Read in chromatin accesibility df containg genewise Tn5 insert densities per genomic feature load( "Output_data/ATAC_summary_peakSummaryALL.Rdata" ) SPU_000241 <- filter( peaks_summary_all, spu_id == "SPU_000241" ) # Convert NA counts to 0's for each genomic feature type peaks_summary_all$peak_density_Introns[ is.na( peaks_summary_all$peak_density_Introns ) ] = 0 peaks_summary_all$peak_density_Intron1[ is.na( peaks_summary_all$peak_density_Intron1 ) ] = 0 peaks_summary_all$peak_density_TSS[ is.na( peaks_summary_all$peak_density_TSS ) ] = 0 peaks_summary_all$peak_density_Exons[ is.na( peaks_summary_all$peak_density_Exons ) ] = 0 # Divide all counts by 3 to create average across three replicates peaks_summary_all$peak_density_Introns <- ( peaks_summary_all$peak_density_Introns) / 3 peaks_summary_all$peak_density_Intron1 <- ( peaks_summary_all$peak_density_Intron1 ) / 3 peaks_summary_all$peak_density_TSS <- ( peaks_summary_all$peak_density_TSS ) / 3 peaks_summary_all$peak_density_Exons <- ( peaks_summary_all$peak_density_Exons ) / 3 # Convert 'spu_id' column to 'geneid' names( peaks_summary_all )[ names( peaks_summary_all ) == "spu_id" ] <- "geneid" ### Create an index of exon lengths and intron lengths for normalization of Tn5 insert counts ## Exon = length of summed exon features ## Intron length = gene length - summed exon lengths # Read in gff file as df Spur_3.1_gff <- read.gff( "Input_data/Strongylocentrotus_purpuratus.Spur_3.1.42.gff3" ) # Calc gene lengths Spur_3.1_gff_genes <- filter( Spur_3.1_gff, type == "gene" ) Spur_3.1_gff_genes$geneid <- gsub( ".*ID=gene:", "", Spur_3.1_gff_genes$attributes ) Spur_3.1_gff_genes$geneid <- gsub( ";.*", "", Spur_3.1_gff_genes$geneid ) Spur_3.1_gff_genes$Len <- Spur_3.1_gff_genes$end - Spur_3.1_gff_genes$start ## Calc exon lenghts # First, output length of each exon Spur_3.1_gff_exons <- filter( Spur_3.1_gff, type == "exon" ) Spur_3.1_gff_exons$geneid <- gsub( ".*Parent=transcript:", "", Spur_3.1_gff_exons$attributes ) Spur_3.1_gff_exons$geneid <- gsub( "-tr.*", "", Spur_3.1_gff_exons$geneid ) Spur_3.1_gff_exons$Len <- Spur_3.1_gff_exons$end - Spur_3.1_gff_exons$start ## Next, sum all exon lengths per gene Spur_3.1_gff_exons_sum <- aggregate( Spur_3.1_gff_exons$Len, by = list( Category = Spur_3.1_gff_exons$geneid ), FUN = sum ) # Adjust column names after aggregating exon legnths per gene names( Spur_3.1_gff_exons_sum ) = c( "geneid", "Len" ) # Create df of geneids, gene lengths, and summed exon lengths Spur_3.1_gff_gene_exon_len <- merge( data.frame( gene_len = Spur_3.1_gff_genes$Len, geneid = Spur_3.1_gff_genes$geneid ), data.frame( geneid = Spur_3.1_gff_exons_sum$geneid, exon_len = Spur_3.1_gff_exons_sum$Len ), by = "geneid" ) # Calculate intron length Spur_3.1_gff_gene_exon_len$intron_len <- Spur_3.1_gff_gene_exon_len$gene_len - Spur_3.1_gff_gene_exon_len$exon_len # Merge gene/exon/intron length info with peaks_summary_all peaks_summary_all_corr <- merge( peaks_summary_all, Spur_3.1_gff_gene_exon_len, by = "geneid" ) # Normalize TSS, intron, and exon accessibility by length peaks_summary_all_corr$TSS_dens_norm <- ( peaks_summary_all_corr$peak_density_TSS / 1000 ) peaks_summary_all_corr$intron_dens_norm <- as.numeric( ifelse( peaks_summary_all_corr$intron_len == "0", "0", peaks_summary_all_corr$peak_density_Introns / peaks_summary_all_corr$intron_len ) ) peaks_summary_all_corr$exon_dens_norm <- peaks_summary_all_corr$peak_density_Exons / peaks_summary_all_corr$exon_len #Export peaks_summary_all_corr write.csv( peaks_summary_all_corr, "Output_data/peaks_summary_all_corr.csv", row.names = FALSE ) ``` #Combine chromatin accessibility and gene expression data ```{r} ## Maternal correlations between promoter DM and transcript DE # Read in maternal differential expression data mat_edgeR_GE <- read.csv( "~/Documents/GitHub/Sp_RRBS_ATAC/A_Differential_Exp_Splicing_Meth/Output_data/mat_edgeR_GE_table_filt.csv" ) # Change transcript ID to geneid mat_edgeR_GE$X <- gsub( "transcript:", "", mat_edgeR_GE$X ) mat_edgeR_GE$X <- gsub( "-tr", "", mat_edgeR_GE$X ) # Subset maternal GE df down to geneid, logFC, and logCPM keep_vars <- c( "X", "logFC", "logCPM" ) mat_edgeR_GE <- mat_edgeR_GE[ keep_vars ] # Change 'X' column to 'geneid' names( mat_edgeR_GE )[ names( mat_edgeR_GE ) == "X" ] <- "geneid" # Merge maternal GE and chromatin access df's ATAC_dens_by_GE <- merge( mat_edgeR_GE, peaks_summary_all_corr, by = "geneid", all.x = TRUE ) ATAC_dens_by_GE[ is.na( ATAC_dens_by_GE ) ] <- 0 # Output maternal GE by chromatin access df write.csv( ATAC_dens_by_GE, "Output_data/ATAC_dens_by_GE.csv", row.names = FALSE ) ``` #Combine chromatin accessibility and transcript variant/splicing data ```{r} ## Retrieve data on transcript variant number per gene from Spur_3.1 assembly and annotation # Read in sp3_1 annotation for Spur_3.1 sp3_gff <- read.gff( "Input_data/sp3_1_GCF.gff3" ) # Filter annotation for transcripts only sp3_transcripts <- filter( sp3_gff, type == "mRNA" ) # Create transcript id and geneid columns sp3_transcripts$product <- gsub( ".*product=", "", sp3_transcripts$attributes ) sp3_transcripts$Locus <- gsub( ".*gene=", "", sp3_transcripts$attributes ) sp3_transcripts$Locus <- gsub( ";.*", "", sp3_transcripts$Locus ) # Measure number of transcript variants by counting rows per geneid sp3_transcripts_freq <- data.frame( table( sp3_transcripts$Locus ) ) # Read in Echinobase geneinfo sheet geneinfo <- read.csv( "Input_data/GenePageGeneralInfo_AllGenes.csv" ) # Filter for rows containing SPU_IDs -> SPU_clean geneinfo_spu <- filter( geneinfo, grepl( "SPU_", SPU ) ) geneinfo_spu$ID <- gsub( ".*SPU", "", geneinfo_spu$SPU ) geneinfo_spu$ID <- substr( geneinfo_spu$ID, 1, 7 ) # Create geneid column w/ SPU_ID geneinfo_spu$geneid <- paste( "SPU", geneinfo_spu$ID, sep = "" ) # Add SPU_ID/geneid to transcript variant frequency df names( sp3_transcripts_freq )[ names( sp3_transcripts_freq ) == "Var1"] <- "Locus" sp3_transcripts_freq <- merge( sp3_transcripts_freq, geneinfo_spu, by = "Locus" ) # Create df of geneid and # of transcript variants spu_mRNA_var_df <- data.frame( geneid = sp3_transcripts_freq$geneid, mRNA_var = sp3_transcripts_freq$Freq ) # Create binary variable for presence of transcript variants spu_mRNA_var_df$splice <- ifelse( spu_mRNA_var_df$mRNA_var > 1, TRUE, FALSE ) # Merge transcript spu_mRNA_var_df with chromatin acessibility data ATAC_dens_by_splicing <- merge( spu_mRNA_var_df, peaks_summary_all_corr, by = "geneid", all.x = TRUE ) ATAC_dens_by_splicing[ is.na( ATAC_dens_by_splicing ) ] <- 0 # Export chromatin accesibility by constitutive splicing data write.csv( ATAC_dens_by_splicing, "Output_data/ATAC_dens_by_splicing.csv", row.names = FALSE ) ``` #Combine diff methylation, diff expression, and chromatin accessibility data ```{r} # Change 'X' column to 'geneid' and other ammendments necessary before merging two edgeR tables names( mat_edgeR_GE )[ names( mat_edgeR_GE ) == "X" ] <- "geneid" names( mat_edgeR_GE )[ names( mat_edgeR_GE ) == "logFC" ] <- "GE_logFC" names( mat_edgeR_GE )[ names( mat_edgeR_GE ) == "logCPM" ] <- "GE_logCPM" # Read in maternal promoter diff meth data dm_promoter_df <- read.csv( "~/Documents/GitHub/Sp_RRBS_ATAC/A_Differential_Exp_Splicing_Meth/Output_data/mat_edgeR_1kb_promoter_DM_df.csv" ) # Subset maternal GE df down to geneid, logFC, and logCPM dm_promoter_df <- dm_promoter_df[ keep_vars ] # Change 'X' column to 'geneid' names( dm_promoter_df )[ names( dm_promoter_df ) == "X" ] <- "geneid" names( dm_promoter_df )[ names( dm_promoter_df ) == "logFC" ] <- "pr_meth_logFC" names( dm_promoter_df )[ names( dm_promoter_df ) == "logCPM" ] <- "pr_meth_logCPM" # Combine DM promoter and DE data (x = methylation; y = gene expression) dm_promoter_by_DE <- merge( dm_promoter_df, mat_edgeR_GE, by = "geneid" ) # 1,114 rows # Add chromatin accessibility data dm_promoter_by_DE_ATAC <- merge( dm_promoter_by_DE, peaks_summary_all_corr, by = "geneid", all.x = TRUE ) dm_promoter_by_DE_ATAC[ is.na( dm_promoter_by_DE_ATAC ) ] <- 0 ## Maternal correlations between intron DM and transcript DE # Read in maternal intron diff meth data dm_intron_df <- read.csv( "~/Documents/GitHub/Sp_RRBS_ATAC/A_Differential_Exp_Splicing_Meth/Output_data/mat_edgeR_intron_gene_DM_df.csv" ) # Fix geneids dm_intron_df$X <- gsub( "-I.*", "", dm_intron_df$X ) # Change 'X' column to 'geneid' names( dm_intron_df )[ names( dm_intron_df ) == "X" ] <- "geneid" names( dm_intron_df )[ names( dm_intron_df ) == "logFC" ] <- "int_meth_logFC" names( dm_intron_df )[ names( dm_intron_df ) == "logCPM" ] <- "int_meth_logCPM" # Combine DM promoter and DE data (x = methylation; y = gene expression) dm_intron_by_DE <- merge( dm_intron_df, mat_edgeR_GE, by = "geneid" ) # 4,028 rows # Add chromatin accessibility data dm_intron_by_DE_ATAC <- merge( dm_intron_by_DE, peaks_summary_all_corr, by = "geneid", all.x = TRUE ) dm_intron_by_DE_ATAC[ is.na( dm_intron_by_DE_ATAC ) ] <- 0 # Read in maternal exon diff meth data dm_exon_df <- read.csv( "~/Documents/GitHub/Sp_RRBS_ATAC/A_Differential_Exp_Splicing_Meth/Output_data/mat_edgeR_exon_gene_DM_df.csv" ) # Fix geneids dm_exon_df$X <- gsub( "-I.*", "", dm_exon_df$X ) # Change 'X' column to 'geneid' names( dm_exon_df )[ names( dm_exon_df ) == "X" ] <- "geneid" names( dm_exon_df )[ names( dm_exon_df ) == "logFC" ] <- "ex_meth_logFC" names( dm_exon_df )[ names( dm_exon_df ) == "logCPM" ] <- "ex_meth_logCPM" # Combine DM promoter and DE data (x = methylation; y = gene expression) dm_exon_by_DE <- merge( dm_exon_df, mat_edgeR_GE, by = "geneid" ) # 3,229 rows # Add chromatin accessibility data dm_exon_by_DE_ATAC <- merge( dm_exon_by_DE, peaks_summary_all_corr, by = "geneid", all.x = TRUE ) dm_exon_by_DE_ATAC[ is.na( dm_exon_by_DE_ATAC ) ] <- 0 ## Merge exon DM, intron DM, and maternal DE dfs # Subset intron and exon DM df's down to geneid, logFC, and logCPM keep_vars_dm <- c( 1, 2, 3 ) dm_intron_df <- dm_intron_df[ keep_vars_dm ] dm_exon_df <- dm_exon_df[ keep_vars_dm ] # Merge intron and exon DM df's dm_intron_exon_by_DE <- merge( merge( dm_intron_df, dm_exon_df, by = "geneid" ), mat_edgeR_GE, by = "geneid" ) # Add chromatin accessibility data dm_intron_exon_by_DE_ATAC <- merge( dm_intron_exon_by_DE, peaks_summary_all_corr, by = "geneid", all.x = TRUE ) dm_intron_exon_by_DE_ATAC[ is.na( dm_intron_exon_by_DE_ATAC ) ] <- 0 # Exports CSV's of diff meth by GE and chromatin access for each genomic feature type or group write.csv( dm_promoter_by_DE_ATAC, "Output_data/dm_promoter_by_DE_ATAC.csv", row.names = FALSE ) write.csv( dm_intron_by_DE_ATAC, "Output_data/dm_intron_by_DE_ATAC.csv", row.names = FALSE ) write.csv( dm_exon_by_DE_ATAC, "Output_data/dm_exon_by_DE_ATAC.csv", row.names = FALSE ) write.csv( dm_intron_exon_by_DE_ATAC, "Output_data/dm_intron_exon_by_DE_ATAC.csv", row.names = FALSE ) ``` #Combine chromatin accessibility and methylation data Does not include gene expression data ```{r} # Merge promoter meth and chromatin access # Add chromatin accessibility data dm_promoter_by_ATAC <- merge( dm_promoter_df, peaks_summary_all_corr, by = "geneid", all.x = TRUE ) dm_promoter_by_ATAC[ is.na( dm_promoter_by_ATAC ) ] <- 0 # Now, introns dm_intron_by_ATAC <- merge( dm_intron_df, peaks_summary_all_corr, by = "geneid", all.x = TRUE ) dm_intron_by_ATAC[ is.na( dm_intron_by_ATAC ) ] <- 0 # Now, exons dm_exon_by_ATAC <- merge( dm_exon_df, peaks_summary_all_corr, by = "geneid", all.x = TRUE ) dm_exon_by_ATAC[ is.na( dm_exon_by_ATAC ) ] <- 0 # Lastly, introns and exons dm_intron_exon_by_ATAC <- merge( merge( dm_intron_df, dm_exon_df, by = "geneid" ), peaks_summary_all_corr, by = "geneid", all.x = TRUE ) dm_intron_exon_by_ATAC[ is.na( dm_intron_exon_by_ATAC ) ] <- 0 # Exports CSV's of diff meth by chromatin access, without GE, for each genomic feature type or group write.csv( dm_promoter_by_ATAC, "Output_data/dm_promoter_by_ATAC.csv", row.names = FALSE ) write.csv( dm_intron_by_ATAC, "Output_data/dm_intron_by_ATAC.csv", row.names = FALSE ) write.csv( dm_exon_by_ATAC, "Output_data/dm_exon_by_ATAC.csv", row.names = FALSE ) write.csv( dm_intron_exon_by_DE_ATAC, "Output_data/dm_intron_exon_by_ATAC.csv", row.names = FALSE ) ``` #Integrate baseline methylation with chromatin accessibility ```{r} ## Read in % meth per CpG df's from Section A # Promoters meth_promoter_perc_meth <- read.csv( "~/Documents/GitHub/Sp_RRBS_ATAC/A_Differential_Exp_Splicing_Meth/Input_data/meth_promoter_perc_meth.csv" ) names( meth_promoter_perc_meth )[ names( meth_promoter_perc_meth ) == "V41" ] <- "geneid" # Introns meth_intron_perc_meth <- read.csv( "~/Documents/GitHub/Sp_RRBS_ATAC/A_Differential_Exp_Splicing_Meth/Input_data/meth_intron_perc_meth.csv" ) meth_intron_perc_meth$geneid <- gsub( "transcript:", "", meth_intron_perc_meth$V41 ) meth_intron_perc_meth$geneid <- gsub( "-tr", "", meth_intron_perc_meth$geneid ) # Exons meth_exon_perc_meth <- read.csv( "~/Documents/GitHub/Sp_RRBS_ATAC/A_Differential_Exp_Splicing_Meth/Input_data/meth_exon_perc_meth.csv" ) meth_exon_perc_meth$geneid <- gsub( "-tr-E*.", "", meth_exon_perc_meth$V41 ) meth_exon_perc_meth$geneid <- gsub( "-E*.", "", meth_exon_perc_meth$geneid ) ## Filter for genes with at least 5 CpGs across each genome feature df # Promoters tt_promoter <- table( meth_promoter_perc_meth$geneid ) df2_pr <- subset( meth_promoter_perc_meth, geneid %in% names( tt_promoter[ tt_promoter >= 5 ] ) ) # Introns tt_intron <- table( meth_intron_perc_meth$geneid ) df2_int <- subset( meth_intron_perc_meth, geneid %in% names( tt_intron[ tt_intron >= 5 ] ) ) # Exons tt_exon <- table( meth_exon_perc_meth$geneid ) df2_ex <- subset( meth_exon_perc_meth, geneid %in% names( tt_exon[ tt_exon >= 5 ] ) ) ## Calculate percent methylation summed across given feature type of each gene # Promoters promoters_perc_meth <- summarySE( measurevar = "median", groupvars = "geneid", data = df2_pr ) # Introns introns_perc_meth <- summarySE( measurevar = "median", groupvars = "geneid", data = df2_int ) # Introns exons_perc_meth <- summarySE( measurevar = "median", groupvars = "geneid", data = df2_ex ) ## Merge chromatin accessibility and baseline meth # Promoters base_meth_pr_by_ATAC <- merge( promoters_perc_meth, peaks_summary_all_corr, by = "geneid", all.x = TRUE ) base_meth_pr_by_ATAC[ is.na( base_meth_pr_by_ATAC ) ] <- 0 # Introns base_meth_int_by_ATAC <- merge( introns_perc_meth, peaks_summary_all_corr, by = "geneid", all.x = TRUE ) base_meth_int_by_ATAC[ is.na( base_meth_int_by_ATAC ) ] <- 0 # Exons base_meth_ex_by_ATAC <- merge( exons_perc_meth, peaks_summary_all_corr, by = "geneid", all.x = TRUE ) base_meth_ex_by_ATAC[ is.na( base_meth_ex_by_ATAC ) ] <- 0 # Exports CSV's of baseline meth by chromatin access, without GE, for each genomic feature type write.csv( base_meth_pr_by_ATAC, "Output_data/base_meth_pr_by_ATAC.csv", row.names = FALSE ) write.csv( base_meth_int_by_ATAC, "Output_data/base_meth_int_by_ATAC.csv", row.names = FALSE ) write.csv( base_meth_ex_by_ATAC, "Output_data/base_meth_ex_by_ATAC.csv", row.names = FALSE ) ``` #Integrate baseline methylation with gene expression and chromatin accessibility ```{r} ## Merge maternal GE df with baseline meth data # Promoters pr_meth_by_GE <- merge( promoters_perc_meth, mat_edgeR_GE, by = "geneid" ) pr_meth_by_GE_ATAC <- merge( pr_meth_by_GE, peaks_summary_all_corr, by = "geneid", all.x = TRUE ) pr_meth_by_GE_ATAC[ is.na( pr_meth_by_GE_ATAC ) ] <- 0 # Introns int_meth_by_GE <- merge( introns_perc_meth, mat_edgeR_GE, by = "geneid" ) int_meth_by_GE_ATAC <- merge( int_meth_by_GE, peaks_summary_all_corr, by = "geneid", all.x = TRUE ) int_meth_by_GE_ATAC[ is.na( int_meth_by_GE_ATAC ) ] <- 0 # Exons ex_meth_by_GE <- merge( exons_perc_meth, mat_edgeR_GE, by = "geneid" ) ex_meth_by_GE_ATAC <- merge( ex_meth_by_GE, peaks_summary_all_corr, by = "geneid", all.x = TRUE ) ex_meth_by_GE_ATAC[ is.na( ex_meth_by_GE_ATAC ) ] <- 0 # Introns and exons int_ex_meth_by_GE <- merge( merge( data.frame( exon_meth = exons_perc_meth$median, geneid = exons_perc_meth$geneid ), data.frame( intron_meth = introns_perc_meth$median, geneid = introns_perc_meth$geneid ), by = "geneid" ), mat_edgeR_GE, by = "geneid") int_ex_meth_by_GE_ATAC <- merge( int_ex_meth_by_GE, peaks_summary_all_corr, by = "geneid", all.x = TRUE ) int_ex_meth_by_GE_ATAC[ is.na( int_ex_meth_by_GE_ATAC ) ] <- 0 # Exports CSV's of baseline meth by chromatin access WITH GE for each genomic feature type write.csv( pr_meth_by_GE_ATAC, "Output_data/pr_meth_by_GE_ATAC.csv", row.names = FALSE ) write.csv( int_meth_by_GE_ATAC, "Output_data/int_meth_by_GE_ATAC.csv", row.names = FALSE ) write.csv( ex_meth_by_GE_ATAC, "Output_data/ex_meth_by_GE_ATAC.csv", row.names = FALSE ) write.csv( int_ex_meth_by_GE_ATAC, "Output_data/int_ex_meth_by_GE_ATAC.csv", row.names = FALSE ) ``` #Integrate baseline methylation with constitutive splicing and chromatin accessibility ```{r} ## Merge baseline splicing df with baseline meth data # Promoters pr_meth_by_splicing <- merge( promoters_perc_meth, spu_mRNA_var_df, by = "geneid" ) # pr_meth_by_splicing_ATAC <- merge( pr_meth_by_splicing, peaks_summary_all_corr, by = "geneid", all.x = TRUE ) pr_meth_by_splicing_ATAC[ is.na( pr_meth_by_splicing_ATAC ) ] <- 0 # Introns int_meth_by_splicing <- merge( introns_perc_meth, spu_mRNA_var_df, by = "geneid" ) # int_meth_by_splicing_ATAC <- merge( int_meth_by_splicing, peaks_summary_all_corr, by = "geneid", all.x = TRUE ) int_meth_by_splicing_ATAC[ is.na( int_meth_by_splicing_ATAC ) ] <- 0 # Exons ex_meth_by_splicing <- merge( exons_perc_meth, spu_mRNA_var_df, by = "geneid" ) # ex_meth_by_splicing_ATAC <- merge( ex_meth_by_splicing, peaks_summary_all_corr, by = "geneid", all.x = TRUE ) ex_meth_by_splicing_ATAC[ is.na( ex_meth_by_splicing_ATAC ) ] <- 0 # Introns and exons int_ex_meth_by_splicing <- merge( merge( data.frame( exon_meth = exons_perc_meth$median, geneid = exons_perc_meth$geneid ), data.frame( intron_meth = introns_perc_meth$median, geneid = introns_perc_meth$geneid ), by = "geneid" ), spu_mRNA_var_df, by = "geneid" ) int_ex_meth_by_splicing_ATAC <- merge( int_ex_meth_by_splicing, peaks_summary_all_corr, by = "geneid", all.x = TRUE ) int_ex_meth_by_splicing_ATAC[ is.na( int_ex_meth_by_splicing_ATAC ) ] <- 0 # Exports CSV's of baseline meth by chromatin access WITH GE for each genomic feature type write.csv( pr_meth_by_splicing_ATAC, "Output_data/pr_meth_by_splicing_ATAC.csv", row.names = FALSE ) write.csv( int_meth_by_splicing_ATAC, "Output_data/int_meth_by_splicing_ATAC.csv", row.names = FALSE ) write.csv( ex_meth_by_splicing_ATAC, "Output_data/ex_meth_by_splicing_ATAC.csv", row.names = FALSE ) write.csv( int_ex_meth_by_splicing_ATAC, "Output_data/int_ex_meth_by_splicing_ATAC.csv", row.names = FALSE ) ``` #Integrate differential exon use and exon-level diff meth ```{r} ## Merge exon-level diff meth, diff exon use, and chromatin accessibility data # Read in data Mat_DEU_df <- read.csv( "~/Documents/GitHub/Sp_RRBS_ATAC/A_Differential_Exp_Splicing_Meth/Output_data/Mat_DEU_df.csv" ) mat_edgeR_exon_DM_df <- read.csv( "~/Documents/GitHub/Sp_RRBS_ATAC/A_Differential_Exp_Splicing_Meth/Output_data/mat_edgeR_exon_DM_df.csv" ) # Create exonID variable names( mat_edgeR_exon_DM_df )[ names( mat_edgeR_exon_DM_df ) == "X" ] <- "exonid" names( Mat_DEU_df )[ names( Mat_DEU_df ) == "X" ] <- "exonid" # Create geneid variable where necessary mat_edgeR_exon_DM_df$geneid <- gsub( "-tr-E.*", "", mat_edgeR_exon_DM_df$exonid ) mat_edgeR_exon_DM_df$geneid <- gsub( "-E.*", "", mat_edgeR_exon_DM_df$geneid ) # Merge data exon_DM_DEU <- merge( data.frame( exonid = Mat_DEU_df$exonid, geneid = Mat_DEU_df$geneid, exon_coeff = Mat_DEU_df$exon_coeff, exon_num = Mat_DEU_df$exon_num ), data.frame( exonid = mat_edgeR_exon_DM_df$exonid, exon_DM_logFC = mat_edgeR_exon_DM_df$logFC ), by = "exonid" ) exon_DM_DEU_ATAC <- merge( exon_DM_DEU, peaks_summary_all_corr, by = "geneid", all.x = TRUE ) exon_DM_DEU_ATAC[ is.na( exon_DM_DEU_ATAC ) ] <- 0 # Output exon-level DM, DEU, and ATAC data write.csv( exon_DM_DEU_ATAC, "Output_data/exon_DM_DEU_ATAC.csv", col.names = FALSE ) ``` #Create exon-level chromatin accessibility df and integrate w/ methylation data ```{r} # Read in within-exon Tn5 inserts load( "~/Documents/GitHub/Sp_RRBS_ATAC/B_Integr_ATAC_RRBS_RNA/Output_data/peakExon_all.Rdata" ) # Read in annotation information for SPU_3.1 gene_info_table_header <- read.table( "~/Documents/GitHub/Sp_RRBS_ATAC/B_Integr_ATAC_RRBS_RNA/Input_data/gene_info_table_header.txt", header = T, sep = "\t", quote = NULL ) # Convert geneID to factor peakExon_all$geneId <- as.factor ( peakExon_all$geneId ) # Create exonid variable peakExon_all$exon_num <- gsub( ".*exon ", "", peakExon_all$annotation ) peakExon_all$exon_num <- gsub( "of .*", "", peakExon_all$exon_num ) peakExon_all$exonid <- paste( peakExon_all$geneId, peakExon_all$exon_num, sep = "_" ) # Count Tn5 tags per exon peakCovExon_all <- count( peakExon_all$exonid ) # Adjust column names of exon counts names( peakCovExon_all ) = c( "exon_common_name", "peak_density_ind_exons" ) # Create common name variable peakCovExon_all$common_name <- gsub( "_.*", "", peakCovExon_all$exon_common_name ) # Merge peakCovExon_all w/ SPU_IDs annot <- as.data.frame( gene_info_table_header[ c( 2, 7) ], ) annot <- as.data.frame( sapply( annot, toupper ) ) peakCovExon_all <- merge( peakCovExon_all, annot, by = "common_name", all.x = TRUE ) # Filter peakCovExon_all for genes w/ spu_id's to be merged with methylation and GE data peakCovExon_all <- filter( peakCovExon_all, spu_id != "NA" ) ## Merge peakExon_all w/ exon length info peakCovExon_all$exonid <- paste( peakCovExon_all$spu_id, gsub( ".*_", "-tr-E", peakCovExon_all$exon_common_name ), sep = "" ) # Create a dataframe to w/ exonid and exon length to be combined w/ accessibility data exon_df_for_merge <- data.frame( exonid = as.character( gsub( ".*Name=", "", gsub( ";constitutive=.*", "", Spur_3.1_gff_exons$attributes ) ) ), exon_length = ( Spur_3.1_gff_exons$end - Spur_3.1_gff_exons$start ) ) # Clean out spaces in exonid's exon_df_for_merge$exonid <- as.factor( gsub( " ", "", exon_df_for_merge$exonid ) ) peakCovExon_all$exonid <- as.factor( gsub( " ", "", peakCovExon_all$exonid ) ) peakCovExon_all <- base::merge( exon_df_for_merge, peakCovExon_all, by.x = "exonid", by.y = "exonid", all.x = TRUE ) # Add in spu_id's where missing peakCovExon_all$spu_id <- as.factor( gsub( "-tr-E.*", "", peakCovExon_all$exonid ) ) # Normalize exon access by length peakCovExon_all$peak_dens_norm_ind_exon <- peakCovExon_all$peak_density_ind_exons / peakCovExon_all$exon_length # Replace NAs w/ 0 in peak_density_ind_exons and peak_dens_norm_ind_exon peakCovExon_all$peak_density_ind_exons[ is.na( peakCovExon_all$peak_density_ind_exons ) ] <- 0 peakCovExon_all$peak_dens_norm_ind_exon[ is.na( peakCovExon_all$peak_dens_norm_ind_exon ) ] <- 0 # Remove common_name columns peakCovExon_all <- peakCovExon_all[ -c( 3, 4 ) ] # Read in exon-level methylation data meth_exon_perc_meth <- read.csv( "~/Documents/GitHub/Sp_RRBS_ATAC/A_Differential_Exp_Splicing_Meth/Input_data/meth_exon_perc_meth.csv" ) # Change variable V41 to exonid names( meth_exon_perc_meth )[ names( meth_exon_perc_meth ) == "V41" ] <- "exonid" ## Filter for exons with at least 5 CpGs across in dataset # Promoters tt_ex_lvl <- table( meth_exon_perc_meth$exonid ) df2_ex_lvl <- subset( meth_exon_perc_meth, exonid %in% names( tt_ex_lvl[ tt_ex_lvl >= 3 ] ) ) # Calculate percent methylation summed across singe exons ex_lvl_perc_meth <- summarySE( measurevar = "median", groupvars = "exonid", data = df2_ex_lvl ) # Merge exon-level meth data with accessibility data exon_lvl_meth_ATAC <- merge( ex_lvl_perc_meth, peakCovExon_all, by = "exonid" ) # Output exom-level meth x accessibility data write.csv( exon_lvl_meth_ATAC, "Output_data/exon_lvl_meth_ATAC.csv" ) # Plot exon access across methylation ggplot( data = exon_lvl_meth_ATAC, aes( y = median, x = peak_dens_norm_ind_exon ) ) + geom_point( alpha = 0.25 ) + geom_smooth( method = "glm", se = TRUE ) ``` #Create genebody-level diff meth by diff expression df w/ ATAC data ```{r} # Read in genebody-level maternal diff meth data mat_edgeR_genebody_diff_meth_df <- read.csv( "~/Documents/GitHub/Sp_RRBS_ATAC/A_Differential_Exp_Splicing_Meth/Output_data/mat_edgeR_genebody_diff_meth_df.csv" ) # Replace 'X' variable in mat_edgeR_genebody_diff_meth_df w/ 'geneid' names( mat_edgeR_genebody_diff_meth_df )[ names( mat_edgeR_genebody_diff_meth_df ) == c( "X", "logFC", "logCPM" ) ] <- c( "geneid", "gb_DM_logFC", "gb_DM_logCPM" ) # Merge genebody DM df w/ ATAC_dens_by_GE: results in df w/ 5,416 rows gb_DM_by_GE_ATAC_df <- merge( mat_edgeR_genebody_diff_meth_df, # 8,328 rows before merge ATAC_dens_by_GE, # 16,303 rows by = "geneid" ) # Output .csv of gb_DM_by_GE_ATAC_df write.csv( gb_DM_by_GE_ATAC_df, "Output_data/gb_DM_by_GE_ATAC_df.csv", row.names = FALSE ) ```