--- title: "75 max transcript enrichment" author: Steven Roberts date: "`r format(Sys.time(), '%d %B, %Y')`" output: html_document: theme: readable highlight: zenburn toc: true toc_float: true number_sections: true code_folding: show code_download: true # github_document: # toc: true # toc_depth: 3 # number_sections: true # html_preview: true --- In this document, I'll take information about changes in the maximum number of transcripts expressed and investigate 1) the significance of these changes, 2) functions associated with these changes and 3) determine if there are any overrepresented biological processes associated with changes # Set up R Markdown document ```{r setup, include=FALSE} knitr::opts_chunk$set( echo = TRUE, # Display code chunks eval = TRUE, # Evaluate code chunks warning = TRUE, # Hide warnings message = TRUE, # Hide messages fig.width = 6, # Set plot width in inches fig.height = 4, # Set plot height in inches fig.align = "center" # Align plots to the center ) knitr::opts_knit$set(root.dir = normalizePath("../output/75-max-transcript-enrichment/")) #Set root directory ``` ```{r} getwd() ``` # Install packages ```{r setup, include=FALSE} # if ("tidyverse" %in% rownames(installed.packages()) == 'FALSE') install.packages('tidyverse') # if ("ggpubr" %in% rownames(installed.packages()) == 'FALSE') install.packages('ggpubr') # if ("data.table" %in% rownames(installed.packages()) == 'FALSE') install.packages('data.table') # if ("RColorBrewer" %in% rownames(installed.packages()) == 'FALSE') install.packages('RColorBrewer') # if ("patchwork" %in% rownames(installed.packages()) == 'FALSE') install.packages('patchwork') ``` ```{r} # if (!require("BiocManager", quietly = TRUE)) # install.packages("BiocManager") # # BiocManager::install("topGO") ``` ```{r} require(tidyverse) require(ggpubr) require(data.table) require(RColorBrewer) require(patchwork) require(topGO) ``` ```{r} sessionInfo() ``` # Import data ```{r} metadata <- read.csv("../../data/adult-meta.csv", header = TRUE) %>% dplyr::rename(., sample = Sample.ID) %>% dplyr::rename(., treatment = Treatment) %>% mutate(., treatment = gsub(treatment, pattern = "Control", replacement = "control")) %>% mutate(., treatment = gsub(treatment, pattern = "Exposed", replacement = "exposed")) #Import metadata. Change column name and contents to be consistent throughout script head(metadata) ``` ## Maximum transcript data ```{r} female_tr <- read.csv("../../supplemental-files/01.01-fmcoe-max-predom-isos-gene_fpkm.csv") %>% dplyr::select(., c(1:2,4)) %>% mutate(., difference = females_exposed_max_transcript_counts - females_controls_max_transcript_counts) #Import transcript count data per gene. Select only the data for females. Create a difference column for exposed vs. control nrow(female_tr) head(female_tr) ``` ```{r} male_tr <- read.csv("../../supplemental-files/01.01-fmcoe-max-predom-isos-gene_fpkm.csv") %>% dplyr::select(., c(1,3,5)) %>% mutate(., difference = males_exposed_max_transcript_counts - males_controls_max_transcript_counts) #Import transcript count data per gene. Select only the data for females. Create a difference column for exposed vs. control head(male_tr) #Confirm import ``` ## Gene lengths ```{r} cov_data <- fread("https://raw.githubusercontent.com/epigeneticstoocean/2018_L18-adult-methylation/main/analyses/genes_treatment_fpkm_CoV_df.csv", col.names = c("gene_ids", "M_control","F_control", "M_exposed", "F_exposed")) #Import cov data cov_data_l <- cov_data %>% pivot_longer(names_to = "sample", values_to = "CoV_FPKM", cols = 2:5) #Pivot longer head(cov_data_l) #Confirm formatting ``` ```{r} Cvir_genom_feats <- fread("https://gannet.fish.washington.edu/Atumefaciens/20211209_cvir_gff-to-bed/20211209_cvir_GCF_002022765.2_genes.bed", col.names = c("chr", "start", "stop", "gene_ids", "V5", "strand")) %>% mutate(., length = stop - start + 1) #Import genome feature BEDfile and create a new column for gene length head(Cvir_genom_feats) #Confirm changes ``` ```{r} geneLengths <- cov_data_l[, c("gene_ids")] %>% unique(.) %>% merge(., Cvir_genom_feats[,c("gene_ids", "length")], by = "gene_ids") %>% dplyr::rename(gene_name = "gene_ids") %>% dplyr::select(., gene_name, length) %>% mutate(., gene_name = gsub(x = gene_name, pattern = "gene-", replacement = "")) #Add gene length to CoV data and retain only gene ID and length column. Rename columns to match naming conventions in this script and remove "gene-" prefix from names head(geneLengths) #Confirm formatting #Confirmm changes ``` ## Methylation data ```{r} geneMethylation <- read.csv("../40-gene-methylaiton.csv", header = TRUE, row.names = 2) geneMethylationMod <- geneMethylation %>% dplyr::select(., -1) #Remove extra column head(geneMethylationMod) ``` ```{r} geneMethylationFem <- geneMethylationMod %>% filter(., grepl("F", rownames(.), fixed = TRUE)) %>% t(.) %>% as.data.frame(.) %>% drop_na(.) %>% as.data.frame(.) %>% rownames_to_column(., var = "gene_name") %>% pivot_longer(., !gene_name, names_to = "sample", values_to = "meth") %>% mutate(., gene_name = gsub(pattern = "gene.", replacement = "", gene_name)) %>% left_join(., metadata, by = "sample") %>% dplyr::select(., c(gene_name, meth, treatment)) %>% dplyr::group_by(gene_name, treatment) %>% summarise(avg_meth = mean(meth, na.rm = TRUE)) %>% ungroup(.) #Filter female samples and reformat data. Remove excess information from gene name column. Add treatment information and average methylation for each gene by treatment. head(geneMethylationFem) #Confirm formatting. Resultant df has one line per gene/treatment. ``` ```{r} geneMethylationFemWide <- geneMethylationFem %>% pivot_wider(., names_from = treatment, names_prefix = "avg_meth_", values_from = avg_meth) %>% replace(is.na(.), 0) %>% mutate(., change_meth = avg_meth_exposed - avg_meth_control) #Pivot dataaframe wider and calculate change in gene methylation head(geneMethylationFemWide) ``` ```{r} geneMethylationMale <- geneMethylationMod %>% filter(., grepl("M", rownames(.), fixed = TRUE)) %>% t(.) %>% as.data.frame(.) %>% drop_na(.) %>% as.data.frame(.) %>% rownames_to_column(., var = "gene_name") %>% pivot_longer(., !gene_name, names_to = "sample", values_to = "meth") %>% mutate(., gene_name = gsub(pattern = "gene.", replacement = "", gene_name)) %>% left_join(., metadata, by = "sample") %>% dplyr::select(., c(gene_name, meth, treatment)) %>% dplyr::group_by(gene_name, treatment) %>% summarise(avg_meth = mean(meth, na.rm = TRUE)) %>% ungroup(.) #Filter male samples and reformat data. Remove excess information from gene name column. Add treatment information and average methylation for each gene by treatment. head(geneMethylationMale) #Confirm formatting ``` ```{r} geneMethylationMaleWide <- geneMethylationMale %>% pivot_wider(., names_from = treatment, names_prefix = "avg_meth_", values_from = avg_meth) %>% replace(is.na(.), 0) %>% mutate(., change_meth = avg_meth_exposed - avg_meth_control) #Pivot dataaframe wider and calculate change in gene methylation head(geneMethylationMaleWide) ``` ```{r} DMLLocationsFemale <- read.delim("../DML-characterization/female_dml-Gene-wb.bed", sep = "\t", header = FALSE, col.names = c("chr", "start", "end", "f_DML", "meth.diff", "gene.chr", "Gnomon", "gene", "gene.start", "gene.end", "V11", "strand", "V13", "V14")) %>% dplyr::select(., "chr", "start", "end", "meth.diff", "gene.start", "gene.end", "V14") %>% separate(., col = V14, into = c("gene_name", "misc"), sep = ";Dbxref=GeneID:") %>% dplyr::select(., -misc) %>% mutate(., gene_name = gsub(pattern = "ID=gene-", replacement = "", x = gene_name)) #Import bedfile with DML locations in genes with gene ID annotations. Rename columns as file is imported then select columns of interest. Separate gene ID annotation information from misc information, and remove misc column. Remove extra information before gene ID head(DMLLocationsFemale) #Confirm file import and changes ``` ```{r} DMLLocationsMale <- read.delim("../DML-characterization/male_dml-Gene-wb.bed", sep = "\t", header = FALSE, col.names = c("chr", "start", "end", "f_DML", "meth.diff", "gene.chr", "Gnomon", "gene", "gene.start", "gene.end", "V11", "strand", "V13", "V14")) %>% dplyr::select(., "chr", "start", "end", "meth.diff", "gene.start", "gene.end", "V14") %>% separate(., col = V14, into = c("gene_name", "misc"), sep = ";Dbxref=GeneID:") %>% dplyr::select(., -misc) %>% mutate(., gene_name = gsub(pattern = "ID=gene-", replacement = "", x = gene_name)) #Import bedfile with DML locations in genes with gene ID annotations. Rename columns as file is imported then select columns of interest. Separate gene ID annotation information from misc information, and remove misc column. Remove extra information before gene ID head(DMLLocationsMale) #Confirm file import and changes ``` ## Expression data ```{r} geneExpression <- read.csv("../../data/gene_fpkm.csv", header = TRUE, row.names = metadata$sample) geneExpressionMod <- geneExpression %>% dplyr::select(., -c(1:6)) #Remove extra information to have just FPKM for each gene head(geneExpressionMod) ``` ```{r} geneExpressionFem <- geneExpressionMod %>% filter(., grepl("F", rownames(.), fixed = TRUE)) %>% t(.) %>% as.data.frame(.) %>% drop_na(.) %>% as.data.frame(.) %>% rownames_to_column(., var = "gene_name") %>% pivot_longer(., cols = !gene_name, names_to = "sample", values_to = "exp") %>% mutate(., gene_name = gsub("gene.", "", gene_name)) %>% left_join(., metadata, by = "sample") %>% dplyr::select(., c(gene_name, exp, treatment)) %>% dplyr::group_by(gene_name, treatment) %>% summarise(avg_exp = mean(exp, na.rm = TRUE)) %>% ungroup(.) #Filter female samples and reformat data. Remove excess information from gene name column. Add treatment information and average expression for each gene by treatment. head(geneExpressionFem) #Confirm formatting ``` ```{r} geneExpressionFem %>% filter(., avg_exp != 0) %>% dplyr::select(., gene_name) %>% distinct(.) %>% nrow(.) ``` ```{r} geneExpressionFemWide <- geneExpressionFem %>% pivot_wider(., names_from = treatment, names_prefix = "avg_exp_", values_from = avg_exp) %>% replace(is.na(.), 0) %>% mutate(., change_exp = avg_exp_exposed - avg_exp_control) #Pivot dataaframe wider and calculate change in gene Expression head(geneExpressionFemWide) ``` ```{r} geneExpressionMale <- geneExpressionMod %>% filter(., grepl("M", rownames(.), fixed = TRUE)) %>% t(.) %>% as.data.frame(.) %>% drop_na(.) %>% as.data.frame(.) %>% rownames_to_column(., var = "gene_name") %>% pivot_longer(., !gene_name, names_to = "sample", values_to = "exp") %>% mutate(., gene_name = gsub("gene.", "", gene_name)) %>% left_join(., metadata, by = "sample") %>% dplyr::select(., c(gene_name, exp, treatment)) %>% dplyr::group_by(gene_name, treatment) %>% summarise(avg_exp = mean(exp, na.rm = TRUE)) %>% ungroup(.) #Filter male samples and reformat data. Remove excess information from gene name column. Add treatment information and average expression for each gene by treatment. head(geneExpressionMale) #Confirm formatting ``` ```{r} geneExpressionMale %>% filter(., avg_exp != 0) %>% dplyr::select(., gene_name) %>% unique(.) %>% nrow(.) ``` ```{r} geneExpressionMaleWide <- geneExpressionMale %>% pivot_wider(., names_from = treatment, names_prefix = "avg_exp_", values_from = avg_exp) %>% replace(is.na(.), 0) %>% mutate(., change_exp = avg_exp_exposed - avg_exp_control) #Pivot dataaframe wider and calculate change in gene Expression head(geneExpressionMaleWide) ``` ## Annotation data ```{r} mRNATrack <- read.delim("../../genome-features/C_virginica-3.0-mRNA.gff", header = FALSE) %>% separate(., V9, into = c("V10", "V11"), sep = "ID=rna-") %>% separate(., V11, into = c("V12", "V13"), sep = ";Parent=") %>% separate(., V13, into = c("V14", "V15"), sep = ";Dbxref=") %>% separate(., V15, into = c("V16", "V17"), sep = ";product=") %>% separate(., V17, into = c("V18", "V19"), sep = ";transcript_id=") %>% dplyr::select(., c("V12", "V14", "V18")) %>% dplyr::rename(., transcript = V12, gene = V14, product = V18) #Winnow down mRNA track mRNATrack$gene_name <- gsub("gene-", "", mRNATrack$gene) #Make gene naming consistent head(mRNATrack) #Confirm changes ``` ```{r} GOterms <- read.delim("../../genome-features/_blast-GO-unfolded.sorted", header = FALSE, col.names = c("GO", "transcript")) head(GOterms) ``` ```{r} GOslim <- read.delim("../../genome-features/GO-GOslim.sorted", header = FALSE, col.names = c("GO", "GOterm", "GOslim", "process")) head(GOslim) ``` # Females ## Maximum transcript change ```{r} nrow(female_tr) #39104 genes total sum(female_tr$difference > 0) #1529 genes with more transcripts expressed in exposed sum(female_tr$difference == 0) #35048 sum(female_tr$difference < 0) #2437 genes with less transcripts expressed in exposed sum(female_tr$difference > 0) / nrow(female_tr) * 100 #3.92% of genes have more transcripts expressed in exposed sum(female_tr$difference == 0) / nrow(female_tr) * 100 #89.83% of genes have more transcripts expressed in exposed sum(female_tr$difference < 0) / nrow(female_tr) * 100 #6.25% of genes have less transcripts expressed in exposed ``` ```{r} maxTransIncFemTest <- prop.test(x = 1529, n = 39104, p = 0.33, alternative = "two.sided", conf.level = 0.95) #Conduct a 1-sample z test maxTransIncFemTest maxTransIncFemTest$statistic #X-squared = 14965 maxTransIncFemTest$p.value #p = 0. Increases in max transcripts happen significantly less than what would be expected by random chance ``` ```{r} maxTransDecFemTest <- prop.test(x = 2437, n = 39104, p = 0.33, alternative = "two.sided", conf.level = 0.95) #Conduct a 1-sample z test maxTransDecFemTest maxTransDecFemTest$statistic #X-squared = 12671.25 maxTransDecFemTest$p.value #p = 0. Decreases in max transcripts happen significantly less than what would be expected by random chance ``` ```{r} maxTransFemTest <- prop.test(x = (1529 + 2437), n = 39104, p = 0.5, alternative = "two.sided", conf.level = 0.95) #Conduct a 1-sample z test maxTransFemTest maxTransFemTest$statistic #24847.36 maxTransFemTest$p.value #p = 0. Changes in max transcripts happen significantly less than what would be expected by random chance ``` ## Annotate data ```{r} maxTransFemAnnot <- inner_join(female_tr, mRNATrack, by = "gene_name") %>% inner_join(., GOterms, by = "transcript") %>% inner_join(., GOslim, by = "GO") #Join maximum transcript information with mRNA, GOterms, and GOSlim information head(maxTransFemAnnot) ``` ## Enrichment test I will use `topGO` to understand if there are biological processes overrepresented in the list of genes that had changes in the maximum number of transcripts expressed when compared to all genes with expression data. I will examine genes with an increase or decrease in maximum transcripts in exposed samples separate for enrichment. ### Make input files First thing I need to do is modify the GOterms list so each transcript ID has all matching GOterms in one line. I then need to pull out only the genes with data for > 1 transcript, since this will be the background list ```{r} maxTransFemAnnot %>% dplyr::select(., gene_name, GO) %>% unique(.) %>% group_by(., gene_name) %>% summarise(GO = paste(GO, collapse = ",")) %>% write.table(., "geneid2go-fem_maxTrans.tab", quote = FALSE, sep = "\t", col.names = FALSE, row.names = FALSE) #Take list of genes and GOterms for female maximum transcript data, and remove any duplicate entries (precaution). Using genes since that was the "input" for the maximum transcrpt analysis. Group by gene ID, then summarise using paste to get all GOterms for each transcript into the same row separated by ",". Save as a tab delimited file ``` ```{r} geneID2GO <- readMappings(file = "geneid2go-fem_maxTrans.tab") #Loading the GO annotations and GeneIDs. Each line has one transcript ID and all associated GOterms str(head(geneID2GO)) #Confirm file structure length(geneID2GO) #31,501 genes with annotations out of 39,104 genes with data for ≥ 2 transcripts ``` ```{r} geneNames <- names(geneID2GO) #Extract names to use as a gene universe head(geneNames) ``` ```{r} maxTransFemPos <- female_tr %>% filter(., difference > 0) %>% dplyr::select(., gene_name) #Take data and retain genes where the predominant isoform shifted (difference > 0). Select the gene ID column and convert to a dataframe. Save as a new object maxTransFemPos <- maxTransFemPos$gene_name #Save column as a vector (will not work otherwise!) ``` ```{r} maxTransFemPosGeneList <- factor(as.integer(geneNames %in% maxTransFemPos)) #Create a factor vector to indicate genes that increased transcripts as significant (1) and didn't as not significant (0) names(maxTransFemPosGeneList) <- geneNames str(maxTransFemPosGeneList) ``` ```{r} maxTransFemNeg <- female_tr %>% filter(., difference < 0) %>% dplyr::select(., gene_name) #Take data and retain genes where the predominant isoform shifted (difference < 0). Select the gene ID column and convert to a dataframe. Save as a new object maxTransFemNeg <- maxTransFemNeg$gene_name #Save column as a vector (will not work otherwise!) ``` ```{r} maxTransFemNegGeneList <- factor(as.integer(geneNames %in% maxTransFemNeg)) #Create a factor vector to indicate genes that increased transcripts as significant (1) and didn't as not significant (0) names(maxTransFemNegGeneList) <- geneNames str(maxTransFemNegGeneList) ``` ### Increases in maximum transcripts #### Run enrichment test I am only going to look at biological processes. ```{r} maxTransFemPosDataBP <- new("topGOdata", ontology = "BP", allGenes = maxTransFemPosGeneList, annot = annFUN.gene2GO, gene2GO = geneID2GO) #Create biological process topGO object maxTransFemPosDataBP #Get summary of object ``` ```{r} test.stat <- new("classicCount", testStatistic = GOFisherTest, name = "Fisher test") resultFisher.maxTransFemPosDataBP <- getSigGroups(maxTransFemPosDataBP, test.stat) resultFisher.maxTransFemPosDataBP ``` ```{r} pvalFis.maxTransFemPosDataBP <- score(resultFisher.maxTransFemPosDataBP) #Extract p-values head(pvalFis.maxTransFemPosDataBP) hist(pvalFis.maxTransFemPosDataBP, 50, xlab = "p-values") #Plot histogram of p-values ``` ```{r} allRes.maxTransFemPosDataBP <- GenTable(maxTransFemPosDataBP, classic = resultFisher.maxTransFemPosDataBP, ranksOf = "classic", orderBy = "classic", topNodes = length(pvalFis.maxTransFemPosDataBP)) #Create a statistical results table with statistical test results. Order by p-value (classic), and include all results (topNodes) head(allRes.maxTransFemPosDataBP) ``` ```{r} write.csv(allRes.maxTransFemPosDataBP, "fem-maxTransPos-BP-FisherTestResults.csv", quote = FALSE, row.names = FALSE) #Save dataframe ``` #### Match enriched GOterms with general annotation information ```{r} sigRes.maxTransFemPosDataBP <- allRes.maxTransFemPosDataBP[1:211,c(1, 6)] #Filter significantly enriched GOterms, only keep GOID and p-value colnames(sigRes.maxTransFemPosDataBP) <- c("GO", "p.value") #Change column names head(sigRes.maxTransFemPosDataBP) ``` ```{r} sigRes.maxTransFemPosDataBP %>% left_join(., y = maxTransFemAnnot, by = "GO") %>% filter(., difference > 0) %>% dplyr::select(., gene_name, difference, females_controls_max_transcript_counts, females_exposed_max_transcript_counts, GO, p.value, product) %>% separate(., col = product, into = c("product", "transcriptVar"), sep = "%2C") %>% dplyr::select(., -transcriptVar) %>% unique(.) %>% write.csv("fem-maxTransPos-BP-EnrichedGO-geneAnnot.csv", quote = FALSE, row.names = FALSE) #Take significant BPGO data and join with annotation information. Select columns of interest. Separate product column by "%2C" and remove transcriptVar column. Select unique rows. This removes all "transcript variant" information so each gene is listed once per significant GOterm. Save as a csv file. ``` ### Decreases in maximum transcripts #### Run enrichment test ```{r} maxTransFemNegDataBP <- new("topGOdata", ontology = "BP", allGenes = maxTransFemNegGeneList, annot = annFUN.gene2GO, gene2GO = geneID2GO) #Create biological process topGO object maxTransFemNegDataBP #Get summary of object ``` ```{r} test.stat <- new("classicCount", testStatistic = GOFisherTest, name = "Fisher test") resultFisher.maxTransFemNegDataBP <- getSigGroups(maxTransFemNegDataBP, test.stat) resultFisher.maxTransFemNegDataBP ``` ```{r} pvalFis.maxTransFemNegDataBP <- score(resultFisher.maxTransFemNegDataBP) #Extract p-values head(pvalFis.maxTransFemNegDataBP) hist(pvalFis.maxTransFemNegDataBP, 50, xlab = "p-values") #Plot histogram of p-values ``` ```{r} allRes.maxTransFemNegDataBP <- GenTable(maxTransFemNegDataBP, classic = resultFisher.maxTransFemNegDataBP, ranksOf = "classic", orderBy = "classic", topNodes = length(pvalFis.maxTransFemNegDataBP)) #Create a statistical results table with statistical test results. Order by p-value (classic), and include all results (topNodes) head(allRes.maxTransFemNegDataBP) ``` ```{r} write.csv(allRes.maxTransFemNegDataBP, "fem-maxTransNeg-BP-FisherTestResults.csv", quote = FALSE, row.names = FALSE) #Save dataframe ``` #### Match enriched GOterms with general annotation information ```{r} sigRes.maxTransFemNegDataBP <- allRes.maxTransFemNegDataBP[1:469,c(1, 6)] #Filter significantly enriched GOterms, only keep GOID and p-value colnames(sigRes.maxTransFemNegDataBP) <- c("GO", "p.value") #Change column names head(sigRes.maxTransFemNegDataBP) ``` ```{r} sigRes.maxTransFemNegDataBP %>% left_join(., y = maxTransFemAnnot, by = "GO") %>% filter(., difference < 0) %>% dplyr::select(., gene_name, difference, females_controls_max_transcript_counts, females_exposed_max_transcript_counts, GO, p.value, product) %>% separate(., col = product, into = c("product", "transcriptVar"), sep = "%2C") %>% dplyr::select(., -transcriptVar) %>% unique(.) %>% write.csv("fem-maxTransNeg-BP-EnrichedGO-geneAnnot.csv", quote = FALSE, row.names = FALSE) #Take significant BPGO data and join with annotation information. Select columns of interest. Separate product column by "%2C" and remove transcriptVar column. Select unique rows. This removes all "transcript variant" information so each gene is listed once per significant GOterm. Save as a csv file. ``` # Changes in maximum transcripts expressed and methylation ## Format data ```{r} maxTransFemWide <- inner_join(female_tr, geneMethylationFemWide, by = "gene_name") %>% inner_join(., geneExpressionFemWide, by = "gene_name") #Join maximum transcript expression data with gene methylation and expression data head(maxTransFemWide) ``` ## Methylation of genes with enriched BPGO ### Increases in maximum transcripts ```{r} maxTransFemPosAnnot <- sigRes.maxTransFemPosDataBP %>% left_join(., y = maxTransFemAnnot, by = "GO") %>% dplyr::select(., gene_name, difference, females_controls_max_transcript_counts, females_exposed_max_transcript_counts, GO, p.value, product) %>% separate(., col = product, into = c("product", "transcriptVar"), sep = "%2C") %>% dplyr::select(., -transcriptVar) %>% unique(.) %>% full_join(x = maxTransFemWide, y = ., by = "gene_name") %>% dplyr::select(., gene_name, difference.x, change_meth, change_exp, GO, p.value, product) %>% dplyr::rename(., difference = "difference.x") %>% filter(., difference > 0) #Take significant BPGO data and join with annotation information. Select columns of interest. Separate product column by "%2C" and remove transcriptVar column. Select unique rows. This removes all "transcript variant" information so each gene is listed once per significant GOterm. Full join with the wide maximum transcript dataframe that has change in methylation and expression data. Select relevant columns and rename change column. head(maxTransFemPosAnnot) #Confirm manipulation nrow(maxTransFemPosAnnot) ``` ```{r} femMaxTransPosEnrichedBPGO <- maxTransFemPosAnnot %>% filter(., is.na(GO) == FALSE) %>% ggplot(aes(x = change_meth)) + geom_histogram() + labs(x = "", y = "Number of Genes", title = "A. Enriched BPGO") + theme_classic(base_size = 15) femMaxTransPosNotEnrichedBPGO <- maxTransFemPosAnnot %>% filter(., is.na(GO) == TRUE) %>% ggplot(aes(x = change_meth)) + geom_histogram() + labs(x = "Change in Methylation (%)", y = "Number of Genes", title = "B. Not Enriched BPGO") + theme_classic(base_size = 15) ``` ```{r} femMaxTransPosEnrichedBPGO / femMaxTransPosNotEnrichedBPGO ggsave("fem-max-transcripts-positive-changeMeth-enrichedBPGO.pdf", height = 8.5, width = 11) ``` ### Decreases in maximum transcripts ```{r} maxTransFemNegAnnot <- sigRes.maxTransFemNegDataBP %>% left_join(., y = maxTransFemAnnot, by = "GO") %>% dplyr::select(., gene_name, difference, females_controls_max_transcript_counts, females_exposed_max_transcript_counts, GO, p.value, product) %>% separate(., col = product, into = c("product", "transcriptVar"), sep = "%2C") %>% dplyr::select(., -transcriptVar) %>% unique(.) %>% full_join(x = maxTransFemWide, y = ., by = "gene_name") %>% dplyr::select(., gene_name, difference.x, change_meth, change_exp, GO, p.value, product) %>% dplyr::rename(., difference = "difference.x") %>% filter(., difference < 0) #Take significant BPGO data and join with annotation information. Select columns of interest. Separate product column by "%2C" and remove transcriptVar column. Select unique rows. This removes all "transcript variant" information so each gene is listed once per significant GOterm. Full join with the wide maximum transcript dataframe that has change in methylation and expression data. Select relevant columns and rename change column. head(maxTransFemNegAnnot) #Confirm manipulation ``` ```{r} femMaxTransNegEnrichedBPGO <- maxTransFemNegAnnot %>% filter(., is.na(GO) == FALSE) %>% ggplot(aes(x = change_meth)) + geom_histogram() + labs(x = "", y = "Number of Genes", title = "A. Enriched BPGO") + theme_classic(base_size = 15) femMaxTransNegNotEnrichedBPGO <- maxTransFemNegAnnot %>% filter(., is.na(GO) == TRUE) %>% ggplot(aes(x = change_meth)) + geom_histogram() + labs(x = "Change in Methylation (%)", y = "Number of Genes", title = "B. Not Enriched BPGO") + theme_classic(base_size = 15) ``` ```{r} femMaxTransNegEnrichedBPGO / femMaxTransNegNotEnrichedBPGO ggsave("fem-max-transcripts-negative-changeMeth-enrichedBPGO.pdf", height = 8.5, width = 11) ``` ## Number of DML in genes with changes in the maximum transcripts expressed ```{r} maxTransFemWideDML <- left_join(maxTransFemWide, DMLLocationsFemale, by = "gene_name") %>% dplyr::select(., -c(chr, gene.start, gene.end)) %>% filter(., difference != 0) %>% group_by(., gene_name) %>% mutate(., DMLcount = n()) %>% mutate(., DMLcount = case_when(is.na(meth.diff) == TRUE ~ 0, is.na(meth.diff) == FALSE ~ DMLcount)) #Join max trans data with DML annotations and remove superfluous columns. Group by gene name and count the number of DML per gene. When there is no DML in the gene (meth.diff == NA), change the DML count to 0. head(maxTransFemWideDML) #Confirm dataframe creation ``` ```{r} write.csv(maxTransFemWideDML, "fem-max-trans-DML-annotations.csv", quote = FALSE, row.names = FALSE) #Save dataframe ``` ```{r} maxTransFemWideDML %>% group_by(., DMLcount, difference) %>% summarise(., count = n()) %>% write.csv(., "fem-max-trans-DML-counts.csv", quote = FALSE, row.names = FALSE) #Group data by DML count and change, then count the number of genes with or without predominant isoform shifts with various DML counts. Save as a csv ``` DMLcount difference count 0 -17 1 0 -16 1 0 -15 1 0 -13 1 0 -10 2 0 -9 3 0 -8 1 0 -7 3 0 -6 5 0 -5 10 0 -4 26 0 -3 72 0 -2 273 0 -1 1624 0 1 1067 0 2 106 0 3 15 0 4 5 0 5 1 0 7 1 1 -3 1 1 1 1 2 -2 4 2 -1 2 ## Plot data ```{r} plotColors <- RColorBrewer::brewer.pal(9, "Greens") ``` ```{r} maxTransFemLong <- maxTransFemWide %>% dplyr::select(., -c(females_controls_max_transcript_counts, females_exposed_max_transcript_counts, change_meth, change_exp)) %>% mutate(., difference_type = case_when(difference > 0 ~ "Increase", difference == 0 ~ "No Change", difference < 0 ~ "Decrease")) %>% pivot_longer(., cols = starts_with("avg_meth_"), names_to = "meth_treatment", values_to = "avg_meth") %>% pivot_longer(., cols = starts_with("avg_exp_"), names_to = "exp_treatment", values_to = "avg_exp") %>% mutate(., meth_treatment = gsub(x = meth_treatment, pattern = "avg_meth_", replacement = "")) %>% mutate(., exp_treatment = gsub(x = exp_treatment, pattern = "avg_exp_", replacement = "")) %>% filter(., meth_treatment == exp_treatment) %>% dplyr::select(., -exp_treatment) %>% dplyr::rename(., treatment = "meth_treatment") %>% mutate(., treatment = case_when(treatment == "control" ~ "Control", treatment == "exposed" ~ "Exposed")) #Remove transcript count and change in methylation/expressoin columns. Create a categorical column for difference in max transcripts. Pivot methylation data longer, then pivot expression data longer. Mutate treatment columns and filter rows where the methylation and expression treatments are the same. Remove the expression treatment column and rename the methylation treatment column head(maxTransFemLong) #Confirm formatting ``` ```{r} maxTransFemLong %>% ggplot(aes(x = avg_meth, y = log(avg_exp + 1), color = difference_type), alpha = 0.8) + geom_point() + geom_smooth(aes(x = avg_meth, y = log(avg_exp + 1)), method = lm, colour = "grey20", linewidth = 2, na.rm = TRUE) + stat_regline_equation(aes(x = avg_meth, y = log(avg_exp + 1), label = paste(..eq.label..)), formula = y ~ x, size = 5, inherit.aes = FALSE) + facet_grid(treatment~difference_type) + labs(x = "Methylation (%)", y = "log FPKM") + scale_color_manual(values = c(plotColors[5], plotColors[8], "grey")) + theme_classic(base_size = 15) + theme(legend.position = "none") ggsave("fem-max-trans-meth-exp-xy.pdf", height = 8.5, width = 11) ``` ```{r} maxTransFemWide %>% mutate(., difference_type = case_when(difference > 0 ~ "Increase", difference == 0 ~ "No Change", difference < 0 ~ "Decrease")) %>% ggplot(aes(x = change_meth, fill = difference_type)) + geom_histogram() + facet_grid(.~difference_type, scales = "free") + labs(x = "Change in Methylation", y = "Number of Genes") + scale_fill_manual(values = c(plotColors[5], plotColors[8], "grey"), labels = c("Increase", "Decrease", "No Change"), name = "") + theme_classic(base_size = 15) + theme(legend.position = "none") ggsave("fem-max-trans-meth-exp-distribution.pdf", height = 8.5, width = 11) ``` ```{r} maxTransFemWideDML %>% mutate(., difference_type = case_when(difference > 0 ~ "Increase", difference == 0 ~ "No Change", difference < 0 ~ "Decrease")) %>% dplyr::select(., -c(start, end, meth.diff)) %>% unique(.) %>% ggplot(aes(x = DMLcount, fill = difference_type)) + geom_histogram() + facet_wrap(.~difference_type, scales = "free_x") + labs(x = "Number of DML", y = "Number of Genes") + scale_fill_manual(values = c(plotColors[5], plotColors[8])) + theme_classic(base_size = 15) + theme(legend.position = "none") ggsave("fem-max-trans-DML-distribution.pdf", height = 8.5, width = 11) ``` ## Model data I want to understand if a change in maximum transcripts expressed due to pH treatment can be explained by a change in gene body methylation due to pH. I will also add number of DML in a gene, gene length (the longer the gene, the more potential for different isoforms/alternative start or stop sites) and change in gene expression due to pH (higher expression = higher likelihood of picking up signal associated with isoform change). I'm going to examine this is in two ways: 1) the difference in maximum transcripts expressed or 2) whether or not there was a change in the maximum number of transcripts. ### Format data ```{r} maxTransFemWideDMLLengths <- maxTransFemWideDML %>% dplyr::select(., -c(start, end, meth.diff)) %>% unique(.) %>% left_join(., geneLengths, by = "gene_name") %>% ungroup(.) %>% mutate(., difference_binary = case_when(difference == 0 ~ 0, difference != 0 ~ 1)) #Remove specific DML information and retain unique rows. Join with gene length information. Create binary variable for difference. head(maxTransFemWideDMLLengths) ``` ### Standard model ```{r} maxTransFemModel <- glm(difference ~ change_meth + DMLcount + log2(change_exp + 1) + log10(length), data = maxTransFemWideDMLLengths) #Model the difference in maximum transcripts based on change in methylation, DML count, change in expression, and gene length summary(maxTransFemModel) #Only length and gene expression significant ``` ```{r} write.csv(broom::tidy(maxTransFemModel), "fem-max-trans-model-output.csv", quote = FALSE, row.names = FALSE) #Save model output ``` ### Binomial model ```{r} maxTransFemModel2 <- glm(I(difference == 1) ~ change_meth + DMLcount + log10(length) + log2(change_exp + 1), family = binomial(), data = maxTransFemWideDMLLengths) #Create a binomial model, where 1 = the predominant isoform shifted when comparing low and control pH conditions. Use change in methylation, gene length, and change in gene expression as explanatory variables summary(maxTransFemModel2) #Only length and gene expression significant ``` ```{r} write.csv(broom::tidy(maxTransFemModel2), "fem-max-trans-binomial-model-output.csv", quote = FALSE, row.names = FALSE) #Save model output ``` # Males ## Maximum transcript change ```{r} nrow(male_tr) #39104 genes total sum(male_tr$difference > 0) #3123 genes with more transcripts expressed in exposed sum(male_tr$difference == 0) #34687 sum(male_tr$difference < 0) #1204 genes with less transcripts expressed in exposed sum(male_tr$difference > 0) / nrow(male_tr) * 100 #8.00% of genes have more transcripts expressed in exposed sum(male_tr$difference == 0) / nrow(male_tr) * 100 #88.9% of genes have more transcripts expressed in exposed sum(male_tr$difference < 0) / nrow(male_tr) * 100 #3.09% of genes have less transcripts expressed in exposed ``` ```{r} maxTransIncMaleTest <- prop.test(x = 3123, n = 39104, p = 0.33, alternative = "two.sided", conf.level = 0.95) #Conduct a 1-sample z test maxTransIncMaleTest maxTransIncMaleTest$statistic #X-squared = 11064.72 maxTransIncMaleTest$p.value #p = 0. Increases in max transcripts happen significantly less than what would be expected by random chance ``` ```{r} maxTransDecMaleTest <- prop.test(x = 1204, n = 39104, p = 0.33, alternative = "two.sided", conf.level = 0.95) #Conduct a 1-sample z test maxTransDecMaleTest maxTransDecMaleTest$statistic #X-squared = 15832.46 maxTransDecMaleTest$p.value #p = 0. Decreases in max transcripts happen significantly less than what would be expected by random chance ``` ```{r} maxTransMaleTest <- prop.test(x = (3123 + 1204), n = 39104, p = 0.5, alternative = "two.sided", conf.level = 0.95) #Conduct a 1-sample z test maxTransMaleTest maxTransMaleTest$statistic #23709.64 maxTransMaleTest$p.value #p = 0. Changes in max transcripts happen significantly less than what would be expected by random chance ``` ## Annotate data ```{r} maxTransMaleAnnot <- inner_join(male_tr, mRNATrack, by = "gene_name") %>% inner_join(., GOterms, by = "transcript") %>% inner_join(., GOslim, by = "GO") #Join with mRNA, GOterm, and GOSlim information head(maxTransMaleAnnot) ``` ## Enrichment test ## Make input files ```{r} maxTransMaleAnnot %>% dplyr::select(., gene_name, GO) %>% unique(.) %>% group_by(., gene_name) %>% summarise(GO = paste(GO, collapse = ",")) %>% write.table(., "geneid2go-male_maxTrans.tab", quote = FALSE, sep = "\t", col.names = FALSE, row.names = FALSE) #Take list of genes and GOterms for male maximum transcript data, and remove any duplicate entries (precaution). Using genes since that was the "input" for the maximum transcrpt analysis. Group by gene ID, then summarise using paste to get all GOterms for each transcript into the same row separated by ",". Save as a tab delimited file ``` ```{r} geneID2GO <- readMappings(file = "geneid2go-male_maxTrans.tab") #Loading the GO annotations and GeneIDs. Each line has one transcript ID and all associated GOterms str(head(geneID2GO)) #Confirm file structure length(geneID2GO) #31,501 genes with annotations out of 39,104 genes with data for ≥ 2 transcripts ``` ```{r} geneNames <- names(geneID2GO) #Extract names to use as a gene universe head(geneNames) ``` ```{r} maxTransMalePos <- male_tr %>% filter(., difference > 0) %>% dplyr::select(., gene_name) #Take data and retain genes where the predominant isoform shifted (difference > 0). Select the gene ID column and convert to a dataframe. Save as a new object maxTransMalePos <- maxTransMalePos$gene_name #Save column as a vector (will not work otherwise!) ``` ```{r} maxTransMalePosGeneList <- factor(as.integer(geneNames %in% maxTransMalePos)) #Create a factor vector to indicate genes that increased transcripts as significant (1) and didn't as not significant (0) names(maxTransMalePosGeneList) <- geneNames str(maxTransMalePosGeneList) ``` ```{r} maxTransMaleNeg <- male_tr %>% filter(., difference < 0) %>% dplyr::select(., gene_name) #Take data and retain genes where the predominant isoform shifted (difference < 0). Select the gene ID column and convert to a dataframe. Save as a new object maxTransMaleNeg <- maxTransMaleNeg$gene_name #Save column as a vector (will not work otherwise!) ``` ```{r} maxTransMaleNegGeneList <- factor(as.integer(geneNames %in% maxTransMaleNeg)) #Create a factor vector to indicate genes that increased transcripts as significant (1) and didn't as not significant (0) names(maxTransMaleNegGeneList) <- geneNames str(maxTransMaleNegGeneList) ``` ### Increases in maximum transcripts #### Run enrichment test ```{r} maxTransMalePosDataBP <- new("topGOdata", ontology = "BP", allGenes = maxTransMalePosGeneList, annot = annFUN.gene2GO, gene2GO = geneID2GO) #Create biological process topGO object maxTransMalePosDataBP #Get summary of object ``` ```{r} test.stat <- new("classicCount", testStatistic = GOFisherTest, name = "Fisher test") resultFisher.maxTransMalePosDataBP <- getSigGroups(maxTransMalePosDataBP, test.stat) resultFisher.maxTransMalePosDataBP ``` ```{r} pvalFis.maxTransMalePosDataBP <- score(resultFisher.maxTransMalePosDataBP) #Extract p-values head(pvalFis.maxTransMalePosDataBP) hist(pvalFis.maxTransMalePosDataBP, 50, xlab = "p-values") #Plot histogram of p-values ``` ```{r} allRes.maxTransMalePosDataBP <- GenTable(maxTransMalePosDataBP, classic = resultFisher.maxTransMalePosDataBP, ranksOf = "classic", orderBy = "classic", topNodes = length(pvalFis.maxTransMalePosDataBP)) #Create a statistical results table with statistical test results. Order by p-value (classic), and include all results (topNodes) head(allRes.maxTransMalePosDataBP) ``` ```{r} write.csv(allRes.maxTransMalePosDataBP, "male-maxTransPos-BP-FisherTestResults.csv", quote = FALSE, row.names = FALSE) #Save dataframe ``` #### Match enriched GOterms with general annotation information ```{r} sigRes.maxTransMalePosDataBP <- allRes.maxTransMalePosDataBP[1:391,c(1, 6)] #Filter significantly enriched GOterms, only keep GOID and p-value colnames(sigRes.maxTransMalePosDataBP) <- c("GO", "p.value") #Change column names head(sigRes.maxTransMalePosDataBP) ``` ```{r} sigRes.maxTransMalePosDataBP %>% left_join(., y = maxTransMaleAnnot, by = "GO") %>% filter(., difference > 0) %>% dplyr::select(., gene_name, difference, males_controls_max_transcript_counts, males_exposed_max_transcript_counts, GO, p.value, product) %>% separate(., col = product, into = c("product", "transcriptVar"), sep = "%2C") %>% dplyr::select(., -transcriptVar) %>% unique(.) %>% write.csv("male-maxTransPos-BP-EnrichedGO-geneAnnot.csv", quote = FALSE, row.names = FALSE) #Take significant BPGO data and join with annotation information. Select columns of interest. Separate product column by "%2C" and remove transcriptVar column. Select unique rows. This removes all "transcript variant" information so each gene is listed once per significant GOterm. Save as a csv file. ``` ### Decreases in maximum transcripts #### Run enrichment test ```{r} maxTransMaleNegDataBP <- new("topGOdata", ontology = "BP", allGenes = maxTransMaleNegGeneList, annot = annFUN.gene2GO, gene2GO = geneID2GO) #Create biological process topGO object maxTransMaleNegDataBP #Get summary of object ``` ```{r} test.stat <- new("classicCount", testStatistic = GOFisherTest, name = "Fisher test") resultFisher.maxTransMaleNegDataBP <- getSigGroups(maxTransMaleNegDataBP, test.stat) resultFisher.maxTransMaleNegDataBP ``` ```{r} pvalFis.maxTransMaleNegDataBP <- score(resultFisher.maxTransMaleNegDataBP) #Extract p-values head(pvalFis.maxTransMaleNegDataBP) hist(pvalFis.maxTransMaleNegDataBP, 50, xlab = "p-values") #Plot histogram of p-values ``` ```{r} allRes.maxTransMaleNegDataBP <- GenTable(maxTransMaleNegDataBP, classic = resultFisher.maxTransMaleNegDataBP, ranksOf = "classic", orderBy = "classic", topNodes = length(pvalFis.maxTransMaleNegDataBP)) #Create a statistical results table with statistical test results. Order by p-value (classic), and include all results (topNodes) head(allRes.maxTransMaleNegDataBP) ``` ```{r} write.csv(allRes.maxTransMaleNegDataBP, "male-maxTransNeg-BP-FisherTestResults.csv", quote = FALSE, row.names = FALSE) #Save dataframe ``` #### Match enriched GOterms with general annotation information ```{r} sigRes.maxTransMaleNegDataBP <- allRes.maxTransMaleNegDataBP[1:238,c(1, 6)] #Filter significantly enriched GOterms, only keep GOID and p-value colnames(sigRes.maxTransMaleNegDataBP) <- c("GO", "p.value") #Change column names head(sigRes.maxTransMaleNegDataBP) ``` ```{r} sigRes.maxTransMaleNegDataBP %>% left_join(., y = maxTransMaleAnnot, by = "GO") %>% filter(., difference < 0) %>% dplyr::select(., gene_name, difference, males_controls_max_transcript_counts, males_exposed_max_transcript_counts, GO, p.value, product) %>% separate(., col = product, into = c("product", "transcriptVar"), sep = "%2C") %>% dplyr::select(., -transcriptVar) %>% unique(.) %>% write.csv("male-maxTransNeg-BP-EnrichedGO-geneAnnot.csv", quote = FALSE, row.names = FALSE) #Take significant BPGO data and join with annotation information. Select columns of interest. Separate product column by "%2C" and remove transcriptVar column. Select unique rows. This removes all "transcript variant" information so each gene is listed once per significant GOterm. Save as a csv file. ``` # Changes in maximum transcripts expressed and methylation ## Format data ```{r} maxTransMaleWide <- inner_join(male_tr, geneMethylationMaleWide, by = "gene_name") %>% inner_join(., geneExpressionMaleWide, by = "gene_name") #Join maximum transcript expression data with gene methylation and expression data head(maxTransMaleWide) ``` ## Methylation of genes with enriched BPGO ### Increases in maximum transcripts ```{r} maxTransMalePosAnnot <- sigRes.maxTransMalePosDataBP %>% left_join(., y = maxTransMaleAnnot, by = "GO") %>% dplyr::select(., gene_name, difference, males_controls_max_transcript_counts, males_exposed_max_transcript_counts, GO, p.value, product) %>% separate(., col = product, into = c("product", "transcriptVar"), sep = "%2C") %>% dplyr::select(., -transcriptVar) %>% unique(.) %>% full_join(x = maxTransMaleWide, y = ., by = "gene_name") %>% dplyr::select(., gene_name, difference.x, change_meth, change_exp, GO, p.value, product) %>% dplyr::rename(., difference = "difference.x") %>% filter(., difference > 0) #Take significant BPGO data and join with annotation information. Select columns of interest. Separate product column by "%2C" and remove transcriptVar column. Select unique rows. This removes all "transcript variant" information so each gene is listed once per significant GOterm. Full join with the wide maximum transcript dataframe that has change in methylation and expression data. Select relevant columns and rename change column. head(maxTransMalePosAnnot) #Confirm manipulation ``` ```{r} maleMaxTransPosEnrichedBPGO <- maxTransMalePosAnnot %>% filter(., is.na(GO) == FALSE) %>% ggplot(aes(x = change_meth)) + geom_histogram() + labs(x = "", y = "Number of Genes", title = "A. Enriched BPGO") + theme_classic(base_size = 15) maleMaxTransPosNotEnrichedBPGO <- maxTransMalePosAnnot %>% filter(., is.na(GO) == TRUE) %>% ggplot(aes(x = change_meth)) + geom_histogram() + labs(x = "Change in Methylation (%)", y = "Number of Genes", title = "B. Not Enriched BPGO") + theme_classic(base_size = 15) ``` ```{r} maleMaxTransPosEnrichedBPGO / maleMaxTransPosNotEnrichedBPGO ggsave("male-max-transcripts-positive-changeMeth-enrichedBPGO.pdf", height = 8.5, width = 11) ``` ### Decreases in maximum transcripts ```{r} maxTransMaleNegAnnot <- sigRes.maxTransMaleNegDataBP %>% left_join(., y = maxTransMaleAnnot, by = "GO") %>% dplyr::select(., gene_name, difference, males_controls_max_transcript_counts, males_exposed_max_transcript_counts, GO, p.value, product) %>% separate(., col = product, into = c("product", "transcriptVar"), sep = "%2C") %>% dplyr::select(., -transcriptVar) %>% unique(.) %>% full_join(x = maxTransMaleWide, y = ., by = "gene_name") %>% dplyr::select(., gene_name, difference.x, change_meth, change_exp, GO, p.value, product) %>% dplyr::rename(., difference = "difference.x") %>% filter(., difference < 0) #Take significant BPGO data and join with annotation information. Select columns of interest. Separate product column by "%2C" and remove transcriptVar column. Select unique rows. This removes all "transcript variant" information so each gene is listed once per significant GOterm. Full join with the wide maximum transcript dataframe that has change in methylation and expression data. Select relevant columns and rename change column. head(maxTransMaleNegAnnot) #Confirm manipulation ``` ```{r} maleMaxTransNegEnrichedBPGO <- maxTransMaleNegAnnot %>% filter(., is.na(GO) == FALSE) %>% ggplot(aes(x = change_meth)) + geom_histogram() + labs(x = "", y = "Number of Genes", title = "A. Enriched BPGO") + theme_classic(base_size = 15) maleMaxTransNegNotEnrichedBPGO <- maxTransMaleNegAnnot %>% filter(., is.na(GO) == TRUE) %>% ggplot(aes(x = change_meth)) + geom_histogram() + labs(x = "Change in Methylation (%)", y = "Number of Genes", title = "B. Not Enriched BPGO") + theme_classic(base_size = 15) ``` ```{r} maleMaxTransNegEnrichedBPGO / maleMaxTransNegNotEnrichedBPGO ggsave("male-max-transcripts-negative-changeMeth-enrichedBPGO.pdf", height = 8.5, width = 11) ``` ## Number of DML in genes with changes in the maximum transcripts expressed ```{r} maxTransMaleWideDML <- left_join(maxTransMaleWide, DMLLocationsMale, by = "gene_name") %>% dplyr::select(., -c(chr, gene.start, gene.end)) %>% filter(., difference != 0) %>% group_by(., gene_name) %>% mutate(., DMLcount = n()) %>% mutate(., DMLcount = case_when(is.na(meth.diff) == TRUE ~ 0, is.na(meth.diff) == FALSE ~ DMLcount)) #Join max trans data with DML annotations and remove superfluous columns. Group by gene name and count the number of DML per gene. When there is no DML in the gene (meth.diff == NA), change the DML count to 0. head(maxTransMaleWideDML) #Confirm dataframe creation ``` ```{r} write.csv(maxTransMaleWideDML, "male-max-trans-DML-annotations.csv", quote = FALSE, row.names = FALSE) #Save dataframe ``` ```{r} maxTransMaleWideDML %>% group_by(., DMLcount, difference) %>% summarise(., count = n()) %>% write.csv(., "male-max-trans-DML-count.csv") #Group data by DML count and change, then count the number of genes with or without predominant isoform shifts with various DML counts. Save as a csv ``` DMLcount difference count 0 -7 1 0 -5 2 0 -4 3 0 -3 13 0 -2 87 0 -1 863 0 1 2134 0 2 325 0 3 72 0 4 26 0 5 13 0 6 3 0 7 3 0 8 1 0 13 1 1 -3 1 1 -2 7 1 -1 57 1 1 95 1 2 20 1 3 5 1 4 4 1 6 1 2 -2 2 2 -1 16 2 1 58 2 2 20 2 3 2 2 4 2 2 7 2 3 -3 3 3 -2 6 3 -1 18 3 1 21 3 2 9 3 3 3 3 4 3 4 -1 4 4 1 8 4 2 4 5 3 10 6 -2 6 7 3 7 9 1 18 13 1 13 14 -1 14 15 -1 15 17 -1 17 25 1 25 ## Plot data ```{r} maxTransMaleLong <- maxTransMaleWide %>% dplyr::select(., -c(males_controls_max_transcript_counts, males_exposed_max_transcript_counts, change_meth, change_exp)) %>% mutate(., difference_type = case_when(difference > 0 ~ "Increase", difference == 0 ~ "No Change", difference < 0 ~ "Decrease")) %>% pivot_longer(., cols = starts_with("avg_meth_"), names_to = "meth_treatment", values_to = "avg_meth") %>% pivot_longer(., cols = starts_with("avg_exp_"), names_to = "exp_treatment", values_to = "avg_exp") %>% mutate(., meth_treatment = gsub(x = meth_treatment, pattern = "avg_meth_", replacement = "")) %>% mutate(., exp_treatment = gsub(x = exp_treatment, pattern = "avg_exp_", replacement = "")) %>% filter(., meth_treatment == exp_treatment) %>% dplyr::select(., -exp_treatment) %>% dplyr::rename(., treatment = "meth_treatment") %>% mutate(., treatment = case_when(treatment == "control" ~ "Control", treatment == "exposed" ~ "Exposed")) #Remove transcript count and change in methylation/expressoin columns. Create a categorical column for difference in max transcripts. Pivot methylation data longer, then pivot expression data longer. Mutate treatment columns and filter rows where the methylation and expression treatments are the same. Remove the expression treatment column and rename the methylation treatment column head(maxTransMaleLong) #Confirm formatting ``` ```{r} maxTransMaleLong %>% ggplot(aes(x = avg_meth, y = log(avg_exp + 1), color = difference_type), alpha = 0.8) + geom_point() + geom_smooth(aes(x = avg_meth, y = log(avg_exp + 1)), method = lm, colour = "grey20", linewidth = 2, na.rm = TRUE) + stat_regline_equation(aes(x = avg_meth, y = log(avg_exp + 1), label = paste(..eq.label..)), formula = y ~ x, size = 5, inherit.aes = FALSE) + facet_grid(treatment~difference_type) + labs(x = "Methylation (%)", y = "log FPKM") + scale_color_manual(values = c(plotColors[5], plotColors[8], "grey")) + theme_classic(base_size = 15) + theme(legend.position = "none") ggsave("male-max-trans-meth-exp-xy.pdf", height = 8.5, width = 11) ``` ```{r} maxTransMaleWide %>% mutate(., difference_type = case_when(difference > 0 ~ "Increase", difference == 0 ~ "No Change", difference < 0 ~ "Decrease")) %>% ggplot(aes(x = change_meth, fill = difference_type)) + geom_histogram() + facet_grid(.~difference_type, scales = "free") + labs(x = "Change in Methylation", y = "Number of Genes") + scale_fill_manual(values = c(plotColors[5], plotColors[8], "grey"), labels = c("Increase", "Decrease", "No Change"), name = "") + theme_classic(base_size = 15) + theme(legend.position = "none") ggsave("male-max-trans-meth-exp-distribution.pdf", height = 8.5, width = 11) ``` ```{r} maxTransMaleWideDML %>% mutate(., difference_type = case_when(difference > 0 ~ "Increase", difference == 0 ~ "No Change", difference < 0 ~ "Decrease")) %>% dplyr::select(., -c(start, end, meth.diff)) %>% unique(.) %>% ggplot(aes(x = DMLcount, fill = difference_type)) + geom_histogram() + facet_wrap(.~difference_type, scales = "free_x") + labs(x = "Number of DML", y = "Number of Genes") + scale_fill_manual(values = c(plotColors[5], plotColors[8])) + theme_classic(base_size = 15) + theme(legend.position = "none") ggsave("male-max-trans-DML-distribution.pdf", height = 8.5, width = 11) ``` ## Model data ### Format data ```{r} maxTransMaleWideDMLLengths <- maxTransMaleWideDML %>% dplyr::select(., -c(start, end, meth.diff)) %>% unique(.) %>% left_join(., geneLengths, by = "gene_name") %>% ungroup(.) %>% mutate(., difference_type = case_when(difference > 0 ~ "Increase", difference == 0 ~ "No Change", difference < 0 ~ "Decrease")) %>% mutate(., difference_binary = case_when(difference == 0 ~ 0, difference != 0 ~ 1)) #Remove specific DML information and retain unique rows. Join with gene length information. Create other modifications, including a categorical variable for difference and a binary variable for difference. head(maxTransMaleWideDMLLengths) ``` ### Standard model ```{r} maxTransMaleModel <- glm(difference ~ change_meth + DMLcount + log2(change_exp + 1) + log10(length), data = maxTransMaleWideDMLLengths) #Model the difference in maximum transcripts based on change in methylation, DML count, change in expression, and gene length summary(maxTransMaleModel) #Only length and gene expression significant ``` ```{r} write.csv(broom::tidy(maxTransMaleModel), "male-max-trans-model-output.csv", quote = FALSE, row.names = FALSE) #Save model output ``` ### Binomial model ```{r} maxTransMaleModel2 <- glm(I(difference == 1) ~ change_meth + DMLcount + log10(length) + log2(change_exp + 1), family = binomial(), data = maxTransMaleWideDMLLengths) #Create a binomial model, where 1 = the predominant isoform shifted when comparing low and control pH conditions. Use change in methylation, gene length, and change in gene expression as explanatory variables summary(maxTransMaleModel2) #Only length and gene expression significant ``` ```{r} write.csv(broom::tidy(maxTransMaleModel2), "male-max-trans-binomial-model-output.csv", quote = FALSE, row.names = FALSE) #Save model output ``` # Create multipanel plots ## Maximum transcripts XY plot ```{r} femMaxTransXY <- maxTransFemLong %>% ggplot(aes(x = avg_meth, y = log(avg_exp + 1), color = difference_type), alpha = 0.8) + geom_point() + geom_smooth(aes(x = avg_meth, y = log(avg_exp + 1)), method = lm, colour = "grey20", linewidth = 2, na.rm = TRUE) + stat_regline_equation(aes(x = avg_meth, y = log(avg_exp + 1), label = paste(..eq.label..)), formula = y ~ x, size = 5, inherit.aes = FALSE) + facet_grid(treatment~difference_type) + labs(x = "", y = "log FPKM", title = "A. Females") + scale_color_manual(values = c(plotColors[5], plotColors[8], "grey")) + theme_classic(base_size = 15) + theme(legend.position = "none") ``` ```{r} maleMaxTransXY <- maxTransMaleLong %>% ggplot(aes(x = avg_meth, y = log(avg_exp + 1), color = difference_type), alpha = 0.8) + geom_point() + geom_smooth(aes(x = avg_meth, y = log(avg_exp + 1)), method = lm, colour = "grey20", linewidth = 2, na.rm = TRUE) + stat_regline_equation(aes(x = avg_meth, y = log(avg_exp + 1), label = paste(..eq.label..)), formula = y ~ x, size = 5, inherit.aes = FALSE) + facet_grid(treatment~difference_type) + labs(x = "Methylation (%)", y = "log FPKM", title = "B. Males") + scale_color_manual(values = c(plotColors[5], plotColors[8], "grey")) + theme_classic(base_size = 15) + theme(legend.position = "none") ``` ```{r} femMaxTransXY / maleMaxTransXY #Create patchwork plot ggsave("max-trans-meth-exp-xy.pdf", height = 8.5, width = 11) ``` ## Maximum transcript vs. change in methylation distribution ```{r} femMaxTransMethChange <- maxTransFemWide %>% mutate(., difference_type = case_when(difference > 0 ~ "Increase", difference == 0 ~ "No Change", difference < 0 ~ "Decrease")) %>% ggplot(aes(x = change_meth, fill = difference_type)) + geom_histogram() + facet_grid(.~difference_type, scales = "free") + labs(x = "", y = "Number of Genes", title = "A. Females") + scale_fill_manual(values = c(plotColors[5], plotColors[8], "grey"), labels = c("Increase", "Decrease", "No Change"), name = "") + theme_classic(base_size = 15) + theme(legend.position = "none") ``` ```{r} maleMaxTransMethChange <- maxTransMaleWide %>% mutate(., difference_type = case_when(difference > 0 ~ "Increase", difference == 0 ~ "No Change", difference < 0 ~ "Decrease")) %>% ggplot(aes(x = change_meth, fill = difference_type)) + geom_histogram() + facet_grid(.~difference_type, scales = "free") + labs(x = "Change in Methylation", y = "Number of Genes", title = "B. Males") + scale_fill_manual(values = c(plotColors[5], plotColors[8], "grey"), labels = c("Increase", "Decrease", "No Change"), name = "") + theme_classic(base_size = 15) + theme(legend.position = "none") ``` ```{r} femMaxTransMethChange / maleMaxTransMethChange ggsave("max-trans-change-meth-distribution.pdf", height = 8.5, width = 11) ```