--- title: Identification of differentially expressed transcripts in P.verrucosa exposed to enriched nutrient supply. author: "Sam White" date: "2/24/2023" output: html_document --- ```{r setup, include=FALSE} knitr::opts_chunk$set(echo = TRUE) ``` # Use [Ballgown](https://github.com/alyssafrazee/ballgown) for identification of differentially expressed isoforms in _P.verrucosa_ exposed to elevated nutrient supplyu. REQUIRES Linux-based system to run all chunks properly; some chunks will not work on Mac OS! REQUIRES the following Bash programs: - `wget` - `tree` - `md5sum` REQUIRES the following R libraries: - [`Ballgown`](https://github.com/alyssafrazee/ballgown) (Bioconductor) - `tidyverse` # Load `R` libraries ```{r load-libraries} library("ballgown") library("tidyverse") library("ggplot2") ``` # Set user variables! ```{r set-variables} # Set maximum pvalue for isoform expression cutoff pvalue <- 0.05 # Set maximum qvalue (false discovery rate) for isoform expression cutoff qvalue <- 0.05 ``` # Download Ballgown input files. Notebooks detailing their creation: - [FastQ trimming](https://robertslab.github.io/sams-notebook/2023/02/15/FastQ-Trimming-and-QC-P.verrucosa-RNA-seq-Data-from-Danielle-Becker-in-Hollie-Putnam-Lab-Using-fastp-FastQC-and-MultiQC-on-Mox.html) - [Genome indexing, and exon/splice sites with HISAT2](https://robertslab.github.io/sams-notebook/2023/01/31/Genome-Indexing-P.verrucosa-v1.0-Assembly-with-HiSat2-on-Mox.html) - [Mapping and identificaion of isoforms with StingTie]() ```{bash download-ballgown-files} # Download Ballgown input files and directory structure wget \ --directory-prefix ../data/ballgown \ --recursive \ --no-check-certificate \ --continue \ --cut-dirs 2 \ --no-host-directories \ --no-parent \ --quiet \ --reject "input_fastqs_checksums.md5" \ --accept "*.ctab,*checksums.md5" https://gannet.fish.washington.edu/Atumefaciens/20230216-pver-stringtie-Pver_genome_assembly_v1.0-isoforms/ ``` # Verify checksums NOTE: Warnings are expected, as the checksums files have checksums for files that are not downloaded for this project. ```{bash checksum-verification} cd ../data/ballgown # Remove "top level" checksums file # Prevents pattern matching downstream rm checksums.md5 # Make a line line="-----------------------------------------------------------------------------------------------" # Set working directory wd=$(pwd) # Loop through directories and verify checksums for directory in */ do cd "${directory}" # Get sample name; strips trailing slash from directory name sample="${directory%/}" echo ${line} echo "${sample}" echo "" # Confirm checksums; sorts for easier reading md5sum --check "${sample}"_checksums.md5 | sort -V echo "" echo "${line}" echo "" cd ${wd} done # Show downloaded directories/files tree ``` # Read in _P.verrucosa_ genes BED file ```{r read-in-genes-BED} genes_BED <- read.table(file = "../data/Pver_genome_assembly_v1.0-valid.genes.bed") # Add BED column names for more clarity colnames(genes_BED) <- c("chr", "start", "end", "name", "score", "strand") head(genes_BED) ``` # Find Ballgown installation location ```{r} data_directory <- system.file('extdata', package='ballgown') # automatically finds ballgown's installation directory # examine data_directory: data_directory ``` # Create Ballgown object ```{r} # Uses regular expression in samplePattern to find all pertinent folders # Load all measurement data bg_all <- ballgown(dataDir="../data/ballgown/", samplePattern='[CE]*', meas='all') bg_all bg_controls <- ballgown(dataDir="../data/ballgown/", samplePattern='C.*', meas='all') bg_controls bg_enriched <- ballgown(dataDir="../data/ballgown/", samplePattern='E.*', meas='all') bg_enriched ``` # Download and filter metadata file Filtered metadata will be used to create a phenotype dataframe needed for Ballgown differential expression analysis. ```{r create-dataframes-for-ballgown-pData} # Read in metadata file sample_metadata_full <- read.csv("../data/metadata.RNAseq.csv") # View full metadata head(sample_metadata_full) # Organize metadata in preparation of creating pData data frame far Ballgown # Sort by "sample_id" to ensure matches directory structure (required for Ballgown) sample_metadata_full <- sample_metadata_full %>% select(fragment.ID, treatment, block, sample_id) %>% arrange(sample_id) # View organized metadata head(sample_metadata_full) ``` # Load ALL phenotype dataframe into Ballgown object ```{r} # Load phenotype info into Ballgown pData(bg_all) <- sample_metadata_full # Examine phenotype data as it exists in Ballgown phenotype_table <- pData(bg_all) head(phenotype_table) ``` ### Look at ALL exon info ```{r} # Exon info structure(bg_all)$exon ``` ### Look at ALL intron info ```{r} # Intron info structure(bg_all)$intron ``` ### Look at ALL transcript (isoform) info ```{r} # Transcript info structure(bg_all)$trans ``` # Load ALL transcript expression data ```{r load-all-transcript-expression-data} # Expression data whole_tx_table <- texpr(bg_all, 'all') # Rename gene_names listed as a "." whole_tx_table <- whole_tx_table %>% mutate(gene_name = ifelse(gene_name == ".", t_name, gene_name)) head(whole_tx_table) # FPKM values for all transcripts # Converts output to data frame transcript_fpkm <- as.data.frame(texpr(bg_all, 'FPKM')) # Put rownames into column for further manipulation transcript_fpkm <- rownames_to_column(transcript_fpkm, "t_id") head(transcript_fpkm) ``` ## Write all transcript data to files ```{r write-transcript-data-to-file} # Write all transcript FPKM data to file write.csv(transcript_fpkm, file = "../output/02-ballgown-differential_gene_expression/fpkm-all_transcripts.csv", quote = FALSE, row.names = FALSE) # Write whole_tx_table to file write.csv(whole_tx_table, file ="../output/02-ballgown-differential_gene_expression/whole_tx_table.csv", quote = FALSE, row.names = FALSE) ``` # Load ALL gene expression data ```{r load-gene-expression-data} whole_gx_table <- gexpr(bg_all) # Convet to data frame whole_gx_table <- as.data.frame(whole_gx_table) # Convert rownames to a column whole_gx_table <- rownames_to_column(whole_gx_table, "name") head(whole_gx_table) ``` ## Write ALL gene data to files ```{r write-gene-data-to-file} # Write whole_gx_table to file write.csv(whole_gx_table, file ="../output/02-ballgown-differential_gene_expression/whole_gx_table.csv", quote = FALSE, row.names = FALSE) ``` # Generate boxplots to compare FPKM across all samples ```{r boxplots-to-compare-FPKM-across-all-samples} # Load phenotype info into Ballgown pData(bg_all) <- sample_metadata_full # Pull all transcript expression values stored in FPKM measurement from ballgown object fpkm <- texpr(bg_all, meas = "FPKM") # Log transform data and add 1 to all values to artificially prevent log2(0). fpkm_df <- as.data.frame(log2(fpkm+1)) head(fpkm_df) # Rotate data frame # Creates a "treatment" column by pulling last character from each sample name (C/E) fpkm_df_pivot <- pivot_longer( fpkm_df, cols = starts_with("FPKM"), names_to = "library") %>% mutate(treatment = (str_sub(library, start = 6, end = 6) ) ) head(fpkm_df_pivot) # Sort data frame by treatment fpkm_df_pivot_sorted <- fpkm_df_pivot %>% arrange(treatment) head(fpkm_df_pivot_sorted) # Set unique library names as vector # Will be used to group boxplot by treatment fpkm_libraries_sorted_unique <- (unique(fpkm_df_pivot_sorted$library)) head(fpkm_libraries_sorted_unique) # Re-order data frame by treatment-sorted data fpkm_df_pivot$library <- factor(fpkm_df_pivot$library, levels = fpkm_libraries_sorted_unique) # Produce boxplots of FPKM for each library # Grouped by treatment ggplot(fpkm_df_pivot, aes(library, value, color = treatment)) + geom_boxplot() + ggtitle("Comparisons of transcript FPKM values across all libraries") + theme(plot.title = element_text(hjust = 0.5)) + ylab("FPKM") + theme(axis.text.x = element_text(angle = 90, vjust = 0.5)) # Save boxplot as PDF ggsave(filename = "../output/02-ballgown-differential_gene_expression/figures//fpkm_c-vs-e_boxplot.pdf") ``` # Identify differentially expressed transcripts (DETs) between treatments Returns FoldChange, too (getFC = TRUE) ```{r DETs-controls-vs-enriched} # Set string describing comparison # Used for final printout in chunk comparison <- "controls vs. enriched" # Load phenotype info into Ballgown pData(bg_all) <- sample_metadata_full # Identify DETs DET_treatment_stat_results = stattest(bg_all, feature='transcript', meas='FPKM', covariate="treatment", getFC = TRUE) head(DET_treatment_stat_results) # Filter based on p-value and q-value DET_treatment_filtered_stat_results <- filter(DET_treatment_stat_results, pval <= pvalue & qval <= qvalue) head(DET_treatment_filtered_stat_results) # Merge with full table to get subset of just differentially expressed isoforms merged_DET_treatment_filtered <- merge(x = DET_treatment_filtered_stat_results, y = whole_tx_table, by.x = "id", by.y = "t_id") head(merged_DET_treatment_filtered) # Convert to BED coordinate system, which is 0-based start # and end EXCLUSIVE; so need to subtract 1 from start coordinates ONLY merged_DET_treatment_filtered <- merged_DET_treatment_filtered %>% mutate(start = start - 1) head(merged_DET_treatment_filtered) # Filter for controls up-regulated transcripts (i.e. fold-change < 1) # Convert to BED coordinate system, which is 0-based start # and end EXCLUSIVE; so need to subtract 1 from start coordinates ONLY DET_treatment_filtered_controls <- filter(merged_DET_treatment_filtered, fc < 1) %>% mutate(start = start - 1) head(DET_treatment_filtered_controls) # Filter for enriched up-regulated transcripts (i.e. fold-change > 1) # Convert to BED coordinate system, which is 0-based start # and end EXCLUSIVE; so need to subtract 1 from start coordinates ONLY DET_treatment_filtered_enriched <- filter(merged_DET_treatment_filtered, fc > 1) %>% mutate(start = start - 1) head(DET_treatment_filtered_enriched) # Count number of DETs. count_DET_treatment_filtered_stat_results <- nrow(DET_treatment_filtered_stat_results) # Print number of DETs cat("Number of", comparison, "DET with p-values and q-values <= ", pvalue, ":", count_DET_treatment_filtered_stat_results, "\n") # Count number of controls DETs. count_DET_treatment_filtered_controls <- nrow(DET_treatment_filtered_controls) # Print number of controls DETs. cat("Number of controls DET with p-values and q-values <= ", pvalue, ":", count_DET_treatment_filtered_controls, "\n") # Count number of enriched DETs. count_DET_treatment_filtered_enriched <- nrow(DET_treatment_filtered_enriched) # Print number of enriched DETs. cat("Number of enriched DETs with p-values and q-values <= ", pvalue, ":", count_DET_treatment_filtered_enriched, "\n") ``` ## Write controls vs enriched DET dataframes to files ```{r write-DETs-controls-vs-enriched-to-files} write.csv( merged_DET_treatment_filtered, file = "../output/02-ballgown-differential_gene_expression/DET_treatment_filtered_p-0.05_q-0.05.csv", quote = FALSE, row.names = FALSE) # Creates a BED file of all DETs and inserts necessary columns to create properly formatted BED file. write.table( ( merged_DET_treatment_filtered %>% select(chr, start, end, t_name, strand) %>% add_column( score = "0", # Inserts a "score" column and assigns a value of "0" to all rows. .before = "strand" ) ), file = "../output/02-ballgown-differential_gene_expression/DET_treatment_filtered_p-0.05_q-0.05.bed", sep = "\t", col.names = FALSE, row.names = FALSE, quote = FALSE ) # Creates a BED file of only controls DETs and inserts necessary columns to create properly formatted BED file. write.table( ( DET_treatment_filtered_controls %>% select(chr, start, end, t_name, strand) %>% add_column( score = "0", # Inserts a "score" column and assigns a value of "0" to all rows. .before = "strand" ) ), file = "../output/02-ballgown-differential_gene_expression/DET_treatment_controls_filtered_p-0.05_q-0.05.bed", sep = "\t", col.names = FALSE, row.names = FALSE, quote = FALSE ) # Creates a BED file of only enriched DETs and inserts necessary columns to create properly formatted BED file. write.table( ( DET_treatment_filtered_enriched %>% select(chr, start, end, t_name, strand) %>% add_column( score = "0", # Inserts a "score" column and assigns a value of "0" to all rows. .before = "strand" ) ), file = "../output/02-ballgown-differential_gene_expression/DET_treatment_enriched_filtered_p-0.05_q-0.05.bed", sep = "\t", col.names = FALSE, row.names = FALSE, quote = FALSE ) ``` # Identify differentially expressed genes (DEGs) between treatment Returns FoldChange, too (getFC = TRUE) ```{r DEGs-controls-vs.-enriched} # Set string describing comparison # Used for final printout in chunk comparison <- "controls vs. enriched" # Load phenotype info into Ballgown pData(bg_all) <- sample_metadata_full # Determine DEGs with ballgown DEG_treatment_stat_results = stattest(bg_all, feature='gene', meas='FPKM', covariate="treatment", getFC = TRUE) head(DEG_treatment_stat_results) # Filter based on p-value and q-value DEG_treatment_filtered_stat_results <- filter(DEG_treatment_stat_results, pval <= pvalue & qval <= qvalue) head(DEG_treatment_filtered_stat_results) # Merge with full table to get subset of just DEGs # and their corresponding transcripts. merged_DEG_treatment_filtered <- merge(x = DEG_treatment_filtered_stat_results, y = whole_tx_table, by.x = "id", by.y = "gene_id") head(merged_DEG_treatment_filtered) # Merge with BED file to get DEGs WITHOUT associated transcripts merged_DEG_treatment_filtered_genes_only <- merge(x = DEG_treatment_filtered_stat_results, y = genes_BED, by.x = "id", by.y = "name") head(merged_DEG_treatment_filtered_genes_only) # Convert to BED coordinate system, which is 0-based start # and end EXCLUSIVE; so need to subtract 1 from start coordinates ONLY merged_DEG_treatment_filtered_ <- merged_DEG_treatment_filtered %>% mutate(start = start -1) head(merged_DEG_treatment_filtered) # Filter for controls up-regulated genes (i.e. fold-change < 1) DEG_treatment_filtered_controls <- filter(merged_DEG_treatment_filtered_genes_only, fc < 1) head(DEG_treatment_filtered_controls) # Filter for enriched up-regulated genes (i.e. fold-change < 1) DEG_treatment_filtered_enriched <- filter(merged_DEG_treatment_filtered_genes_only, fc > 1) head(DEG_treatment_filtered_enriched) # Count number of DEG. count_merged_DEG_treatment_filtered_genes_only <- nrow(merged_DEG_treatment_filtered_genes_only) # Print number of DEG cat("Number of ", comparison, " DEG with p-values and q-values <= ", pvalue, ":", count_merged_DEG_treatment_filtered_genes_only, "\n") # Count number of controls DEG. count_DEG_treatment_filtered_controls <- nrow(DEG_treatment_filtered_controls) # Print number of controls DEG. cat("Number of controls DEG with p-values and q-values <= ", pvalue, ":", count_DEG_treatment_filtered_controls, "\n") # Count number of enriched DEG. count_DEG_treatment_filtered_enriched <- nrow(DEG_treatment_filtered_enriched) # Print number of enriched DEG. cat("Number of enriched DEG with p-values and q-values <= ", pvalue, ":", count_DEG_treatment_filtered_enriched, "\n") ``` ## Write DEGs enriched vs. controls dataframes to files ```{r write-DEGs-controls-vs-enriched-to-files} # Write merged dataframe to CSV write.csv( merged_DEG_treatment_filtered, file = "../output/02-ballgown-differential_gene_expression/DEG_treatment_all_filtered_p-0.05_q-0.05.csv", quote = FALSE, row.names = FALSE) # Creates a BED file of all DEG. write.table( (merged_DEG_treatment_filtered_genes_only %>% select(chr, start, end, id, score, strand) ), file = "../output/02-ballgown-differential_gene_expression/DEG_treatment_all_filtered_p-0.05_q-0.05.bed", sep = "\t", col.names = FALSE, row.names = FALSE, quote = FALSE ) # Creates a BED file of only controls DEGs. write.table( ( DEG_treatment_filtered_controls %>% select(chr, start, end, id, score, strand) ), file = "../output/02-ballgown-differential_gene_expression/DEG_treatment_controls_filtered_p-0.05_q-0.05.bed", sep = "\t", col.names = FALSE, row.names = FALSE, quote = FALSE ) # Creates a BED file of only enriched DEGs. write.table( ( DEG_treatment_filtered_enriched %>% select(chr, start, end, id, score, strand) ), file = "../output/02-ballgown-differential_gene_expression/DEG_treatment_enriched_filtered_p-0.05_q-0.05.bed", sep = "\t", col.names = FALSE, row.names = FALSE, quote = FALSE ) ```