--- title: "Identification of differentially expressed isoforms using Ballgown in _S.namaysuch_ non-parasitized/parasitized liver tissue in two different subspecies: lean and siscowet." author: "Sam White" date: "08/17/2022" output: html_document --- ```{r setup, include=FALSE} knitr::opts_chunk$set(echo = TRUE) ``` # Use [Ballgown](https://github.com/alyssafrazee/ballgown) for identification of differentially expressed isoforms in _S.namaysuch_ non-parasitized/parasitized liver tissue in two different subspecies: lean and siscowet. REQUIRES Linux-based system to run all chunks properly; some chunks will not work on Mac OS! REQUIRES the following Bash programs: - `wget` - `tree` - `md5sum` REQUIRES the following R libraries: - [`Ballgown`](https://github.com/alyssafrazee/ballgown) (Bioconductor) - `tidyverse` ## Load `R` libraries ```{r} library("ballgown") library("tidyverse") library("ggplot2") library("Rfast") ``` ## Set user variables! ```{r set-pval-qval-cutoffs} # Set maximum pvalue for isoform expression cutoff pvalue <- 0.05 # Set maximum qvalue (false discovery rate) for isoform expression cutoff qvalue <- 0.05 ``` ## Download Ballgown input files. Notebooks detailing their creation: - [FastQ trimming](https://robertslab.github.io/sams-notebook/2022/07/06/SRA-Data-S.namaycush-SRA-BioProject-PRJNA674328-Download-and-QC.html) - [Genome indexing, and exon/splice sites with HISAT2](https://robertslab.github.io/sams-notebook/2022/08/10/Splice-Site-Identification-S.namaycush-Liver-Parasitized-and-Non-Parasitized-SRA-RNAseq-Using-Hisat2-Stingtie-with-Genome-GCF_016432855.1.html) - [Mapping and identificaion of isoforms with StingTie](https://robertslab.github.io/sams-notebook/2022/08/10/Splice-Site-Identification-S.namaycush-Liver-Parasitized-and-Non-Parasitized-SRA-RNAseq-Using-Hisat2-Stingtie-with-Genome-GCF_016432855.1.html) ```{bash download-bgown-input-tables} # Download Ballgown input files and directory structure wget \ --directory-prefix ../data/ballgown \ --recursive \ --no-check-certificate \ --continue \ --cut-dirs 2 \ --no-host-directories \ --no-parent \ --quiet \ --reject "concatenated-fastq-checksums.md5,original-fastq-checksums.md5" \ --reject-regex "/multiqc_data/" \ --accept "*.ctab,*checksums.md5" https://gannet.fish.washington.edu/Atumefaciens/20220810-snam-hisat2-GCF_016432855.1_index-align-stringtie_isoforms/ ``` ```{bash cleanup} rm ../data/ballgown/checksums.md5 ``` ## Verify checksums NOTE: Warnings are expected, as the checksums files have checksums for files that are not downloaded for this project. ```{bash} cd ../data/ballgown pwd # Make a line line="-----------------------------------------------------------------------------------------------" # Set working directory wd=$(pwd) # Loop through directories and verify checksums for directory in */ do cd "${directory}" # Get sample name; strips trailing slash from directory name sample="${directory%/}" echo ${line} echo "${sample}" echo "" # Confirm checksums; sorts for easier reading md5sum --check "${sample}"-checksums.md5 | sort -V echo "" echo "${line}" echo "" cd ${wd} done # Show downloaded directories/files tree ``` ## Find Ballgown installation location ```{r set-ballgown-install-dir} data_directory <- system.file('extdata', package='ballgown') # automatically finds ballgown's installation directory # examine data_directory: data_directory ``` ## Read in _S.namaycush_ genes BED file ```{r read-in-genes-BED} genes_BED <- read.table(file = "../data/20220818-snam-GCF_016432855.1_SaNama_1.0_genes.bed") # Add BED column names for more clarity colnames(genes_BED) <- c("chr", "start", "end", "name", "score", "strand") head(genes_BED) ``` ## Create Ballgown object ```{r create-ballgown-objects} # Uses regular expression in samplePattern to find all pertinent folders # Load all measurement data bg <- ballgown(dataDir="../data/ballgown/", samplePattern='[NP]*', meas='all') bg bg.nonparasitized <- ballgown(dataDir="../data/ballgown/", samplePattern='NP*', meas='all') bg.nonparasitized bg.nonparasitized.lean <- ballgown(dataDir="../data/ballgown/", samplePattern='NPLL*', meas='all') bg.nonparasitized.lean bg.nonparasitized.siscowet <- ballgown(dataDir="../data/ballgown/", samplePattern='NPSL*', meas='all') bg.nonparasitized.siscowet bg.parasitized <- ballgown(dataDir="../data/ballgown/", samplePattern='P*', meas='all') bg.parasitized bg.parasitized.lean <- ballgown(dataDir="../data/ballgown/", samplePattern='PLL*', meas='all') bg.parasitized.lean bg.parasitized.siscowet <- ballgown(dataDir="../data/ballgown/", samplePattern='PSL*', meas='all') bg.parasitized.siscowet ``` ## Download and filter metadata file Filtered metadata will be used to create a phenotype dataframe needed for Ballgown differential expression analysis. Will use the 'strain` column. `strain` column explanation: lean parasitized = 1 lean nonparasitized = 2 siscowet parasitized = 3 siscowet nonparasitized = 4 ```{r create-dataframes-for-ballgwon-pData} # Read in metadata file from URL # Data is alreayd properly sorted (by library) to match directory structure - required by Ballgown. sample_metadata_full <- read.csv("https://raw.githubusercontent.com/RobertsLab/project-lake-trout/main/data/ballgown-metadata.csv") # View full metadata head(sample_metadata_full) # Create modified metadata for LEAN only # 1 = LEAN PARASTIZED # 2 = LEAN NONPARASITIZED sample_metadata_subset_lean_only <- sample_metadata_full %>% filter(strain == "1" | strain == "2") # Test to make sure expected number of rows (does NOT count header row) # Expect 12 rows; 6 samples from each grouping stopifnot(12 == nrow(sample_metadata_subset_lean_only)) head(sample_metadata_subset_lean_only) # Create modified metadata for SISCOWET only # 3 = SISCOWET PARASITIZED # 4 = SISCOWET NONPARASITIZED sample_metadata_subset_siscowet_only <- sample_metadata_full %>% filter(strain == "3" | strain == "4") # Test to make sure expected number of rows (does NOT count header row) # Expect 12 rows; 6 samples from each grouping stopifnot(12 == nrow(sample_metadata_subset_siscowet_only)) head(sample_metadata_subset_siscowet_only) # Create modified metadata for PARASITIZED (leand and siscowet combined) # 1 = LEAN PARASTIZED # 3 = SISCOWET PARASITIZED sample_metadata_subset_parasitized <- sample_metadata_full %>% filter(strain == "1" | strain == "3") # Test to make sure expected number of rows (does NOT count header row) # Expect 12 rows; 6 samples from each grouping stopifnot(12 == nrow(sample_metadata_subset_parasitized)) head(sample_metadata_subset_parasitized) # Create modified metadata for NONPARASITIZED (leand and siscowet combined) # 1 = LEAN NONPARASTIZED # 4 = SISCOWET NONPARASITIZED sample_metadata_subset_nonparasitized <- sample_metadata_full %>% filter(strain == "3" | strain == "4") # Test to make sure expected number of rows (does NOT count header row) # Expect 12 rows; 6 samples from each grouping stopifnot(12 == nrow(sample_metadata_subset_nonparasitized)) head(sample_metadata_subset_nonparasitized) ``` ## Load phenotype dataframe into Ballgown object ```{r load-phenotype-full} # Load phenotype info into Ballgown pData(bg) <- sample_metadata_full # Examine phenotype data as it exists in Ballgown phenotype_table <- pData(bg) head(phenotype_table) ``` ## Look at exon info ```{r all-exon-info} # Exon info structure(bg)$exon ``` ## Look at intron info ```{r all-intron-info} # Intron info structure(bg)$intron ``` ## Look at transcript (isoform) info ```{r all-transcript-info} # Transcript info structure(bg)$trans ``` ## Load all transcript expression data ```{r load-all-transcript-expression} # Expression data whole_tx_table <- texpr(bg, 'all') # Rename gene_names listed as a "." whole_tx_table <- whole_tx_table %>% mutate(gene_name = ifelse(gene_name == ".", t_name, gene_name)) head(whole_tx_table) # FPKM values for all transcripts # Converts output to data frame transcript_fpkm <- as.data.frame(texpr(bg, 'FPKM')) # Put rownames into column named "t_id" for further manipulation transcript_fpkm <- rownames_to_column(transcript_fpkm, "t_id") head(transcript_fpkm) ``` ## Load all gene expression data ```{r load-gene-expression-data} whole_gx_table <- gexpr(bg) head(whole_gx_table) ``` ```{r write-all-fpkm-to-file} write.csv(transcript_fpkm, file = "../data/fpkm-all_transcripts.csv", quote = FALSE, row.names = FALSE) ``` ```{r write-tx-table-to-file} # Write whole_tx_table to file write.csv(whole_tx_table, file ="../data/whole_tx_table.csv", quote = FALSE, row.names = FALSE) ``` ## Generate boxplots to compare FPKM across all samples ```{r boxplots-to-compare-FPKM-across-all-samples} # Load phenotype info into Ballgown pData(bg) <- sample_metadata_full # Pull all transcript expression values stored in FPKM measurement from ballgown object fpkm <- texpr(bg, meas = "FPKM") # Log transform data and add 1 to all values to artificially prevent log10(0). fpkm_df <- as.data.frame(log10(fpkm+1)) head(fpkm_df) # Rotate data frame # Creates a "subspecies" column by examining library name length # Parses out subspecies "L" or "S" depending on length of library name # L = lean # S = siscowet # Then, changes abbreviated subspecies to full, corresponding name fpkm_df_pivot <- pivot_longer( fpkm_df, cols = starts_with("FPKM"), names_to = "library") %>% mutate(subspecies = case_when( nchar(library) == 11 ~ str_sub(library, 8, 8), nchar(library) == 10 ~ str_sub(library, 7, 7) ) ) %>% mutate(subspecies = replace(subspecies, subspecies == "L", "lean")) %>% mutate(subspecies = replace(subspecies, subspecies == "S", "siscowet")) head(fpkm_df_pivot) # Sort data frame by subspecies fpkm_df_pivot_sorted <- fpkm_df_pivot %>% arrange(subspecies) head(fpkm_df_pivot_sorted) # Set unique library names as vector # Will be used to group boxplot by subspecies fpkm_libraries_sorted_unique <- (unique(fpkm_df_pivot_sorted$library)) head(fpkm_libraries_sorted_unique) # Re-order data frame by subspecies-sorted data fpkm_df_pivot$library <- factor(fpkm_df_pivot$library, levels = fpkm_libraries_sorted_unique) # Produce boxplots of FPKM for each library # Grouped by subspecies ggplot(fpkm_df_pivot, aes(library, value, color = subspecies)) + geom_boxplot() + ggtitle("Comparisons of transcript FPKM values across all libraries") + theme(plot.title = element_text(hjust = 0.5)) + ylab("FPKM (log10+1 transformed") + theme(axis.text.x = element_text(angle = 90, vjust = 0.5)) ``` ```{r save-boxplot-all-fpkm} # Save boxplot as PDF ggsave(filename = "../figures/fpkm-lean-v-siscowet-boxplot.pdf") ``` ## Identify differentially expressed transcripts (DETs) between subspecies. ## Returns FoldChange, too (getFC = TRUE) ### THERE ARE NO SAMPLES WITH q-values <=0.05!!! ```{r DETs-lean-vs.-siscowet} # Set string describing comparison # Used for final printout in chunk comparison <- "lean vs. siscowet" # Load phenotype info into Ballgown pData(bg) <- sample_metadata_full # Identify DETs DET_subspecies_stat_results = stattest(bg, feature='transcript', meas='FPKM', covariate="subspecies", getFC = TRUE) head(DET_subspecies_stat_results) # Filter based on p-value and q-value DET_subspecies_filtered_stat_results <- filter(DET_subspecies_stat_results, pval <= pvalue & qval <= qvalue) head(DET_subspecies_filtered_stat_results) ``` ## Identify differentially expressed transcripts (DETs) between subspecies. ## Returns FoldChange, too (getFC = TRUE) ## Restricted to p-value filtering ONLY. q-values are all >=0.05!! ```{r DETs-lean-vs-siscowet-pval-only} # Filter based on p-value only DET_subspecies_pval_filtered_stat_results <- filter(DET_subspecies_stat_results, pval <= pvalue) head(DET_subspecies_pval_filtered_stat_results) # Merge with full table to get subset of just differentially expressed transcripts merged_DET_subspecies_pval_filtered <- merge(x = DET_subspecies_pval_filtered_stat_results, y = whole_tx_table, by.x = "id", by.y = "t_id") head(merged_DET_subspecies_pval_filtered) # Convert to BED coordinate system, which is 0-based start # and end EXCLUSIVE; so need to subtract 1 from start coordinates ONLY merged_DET_subspecies_pval_filtered_start_minus_1 <- merged_DET_subspecies_pval_filtered %>% mutate(start = start - 1) head(merged_DET_subspecies_pval_filtered_start_minus_1) # Filter for lean up-regulated transcripts (i.e. fold-change < 1) # Convert to BED coordinate system, which is 0-based start # and end EXCLUSIVE; so need to subtract 1 from start coordinates ONLY DET_subspecies_pval_filtered_lean <- filter(merged_DET_subspecies_pval_filtered, fc < 1) %>% mutate(start = start - 1) head(DET_subspecies_pval_filtered_lean) # Filter for siscowet up-regulated transcripts (i.e. fold-change > 1) # Convert to BED coordinate system, which is 0-based start # and end EXCLUSIVE; so need to subtract 1 from start coordinates ONLY DET_subspecies_pval_filtered_siscowet <- filter(merged_DET_subspecies_pval_filtered, fc > 1) %>% mutate(start = start - 1) head(DET_subspecies_pval_filtered_siscowet) # Count number of DET. count_DET_subspecies_pval_filtered_stat_results <- nrow(DET_subspecies_pval_filtered_stat_results) # Count number of lean DET. count_DET_subspecies_pval_filtered_lean <- nrow(DET_subspecies_pval_filtered_lean) # Count number of siscowet DET. count_DET_subspecies_pval_filtered_siscowet <- nrow(DET_subspecies_pval_filtered_siscowet) # Check to make sure total number of DEGs matches sum of each treatment compared DEGs. # Stops script if counts do not sum to total DEGs. stopifnot(count_DET_subspecies_pval_filtered_stat_results == sum(count_DET_subspecies_pval_filtered_lean, count_DET_subspecies_pval_filtered_siscowet ) ) # Print number of DET cat("Number of", comparison, "DET with p-values <= ", pvalue, ":", count_DET_subspecies_pval_filtered_stat_results, "\n") # Print number of lean DET cat("Number of lean DET with p-values <= ", pvalue, ":", count_DET_subspecies_pval_filtered_lean, "\n") # Print number of siscowet DET cat("Number of siscowet DET with p-values <= ", pvalue, ":", count_DET_subspecies_pval_filtered_siscowet, "\n") ``` ```{r write-DETs-lean-vs-siscowet-to-files} # Write merged dataframe to CSV write.csv( merged_DET_subspecies_pval_filtered, file = "../analyses/DET-subspecies-pval_filtered_p-0.05.csv", quote = FALSE, row.names = FALSE) # Write pval/qval filtered dataframe to CSV write.csv( DET_subspecies_filtered_stat_results, file = "../analyses/DET-subspecies-pval_qval_filtered_p-q-0.05.csv", quote = FALSE, row.names = FALSE ) # Creates a BED file of all DETs and inserts necessary columns to create properly formatted BED file. write.table( ( merged_DET_subspecies_pval_filtered_start_minus_1 %>% select(chr, start, end, t_name, strand) %>% add_column( score = "0", # Inserts a "score" column and assigns a value of "0" to all rows. .before = "strand" ) ), file = "../analyses/DET-subspecies-pval_filtered_p-0.05.bed", sep = "\t", col.names = FALSE, row.names = FALSE, quote = FALSE ) # Creates a BED file of only lean DETs and inserts necessary columns to create properly formatted BED file. write.table( ( DET_subspecies_pval_filtered_lean %>% select(chr, start, end, t_name, strand) %>% add_column( score = "0", # Inserts a "score" column and assigns a value of "0" to all rows. .before = "strand" ) ), file = "../analyses/DET-subspecies-lean-pval_filtered_p-0.05.bed", sep = "\t", col.names = FALSE, row.names = FALSE, quote = FALSE ) # Creates a BED file of only siscowet DETs and inserts necessary columns to create properly formatted BED file. write.table( ( DET_subspecies_pval_filtered_siscowet %>% select(chr, start, end, t_name, strand) %>% add_column( score = "0", # Inserts a "score" column and assigns a value of "0" to all rows. .before = "strand" ) ), file = "../analyses/DET-subspecies-siscowet-pval_filtered_p-0.05.bed", sep = "\t", col.names = FALSE, row.names = FALSE, quote = FALSE ) ``` ## Identify differentially expressed GENES (DEGs) between subspecies. ## Returns FoldChange, too (getFC = TRUE) ### THERE ARE NO SAMPLES WITH q-values <=0.05!!! ```{r DEGs-lean-vs.-siscowet} # Set string describing comparison # Used for final printout in chunk comparison <- "lean vs. siscowet" # Load phenotype info into Ballgown pData(bg) <- sample_metadata_full # Identify DEGs DEG_subspecies_stat_results = stattest(bg, feature='gene', meas='FPKM', covariate="subspecies", getFC = TRUE) head(DEG_subspecies_stat_results) # Filter based on p-value and q-value DEG_subspecies_filtered_stat_results <- filter(DEG_subspecies_stat_results, pval <= pvalue & qval <= qvalue) head(DEG_subspecies_filtered_stat_results) # Count number of DEG. count_DEG_subspecies_filtered_stat_results <- nrow(DEG_subspecies_filtered_stat_results) # Print number of DEG cat("Number of", comparison, "DEG with p-values <=", pvalue, "&", "q-values <=", qvalue, ":", count_DEG_subspecies_filtered_stat_results, "\n") ``` ## Identify differentially expressed GENES (DEGs) between subspecies. ## Returns FoldChange, too (getFC = TRUE) ## Restricted to p-value filtering ONLY. q-values are all >=0.05!! ```{r DEGs-lean-vs-siscowet-pval-only} # Filter based on p-value only DEG_subspecies_pval_filtered_stat_results <- filter(DEG_subspecies_stat_results, pval <= pvalue) head(DEG_subspecies_pval_filtered_stat_results) # Merge with full table to get subset of just differentially expressed GENES merged_DEG_subspecies_pval_filtered <- merge(x = DEG_subspecies_pval_filtered_stat_results, y = whole_tx_table, by.x = "id", by.y = "gene_id") head(merged_DEG_subspecies_pval_filtered) # Merge with BED file to get DEGs WITHOUT associated transcripts merged_DEG_subspecies_pval_filtered_genes_only <- merge(x = DEG_subspecies_pval_filtered_stat_results, y = genes_BED, by.x = "id", by.y = "name") head(merged_DEG_subspecies_pval_filtered_genes_only) # Convert to BED coordinate system, which is 0-based start # and end EXCLUSIVE; so need to subtract 1 from start coordinates ONLY merged_DEG_subspecies_pval_filtered_genes_only <- merged_DEG_subspecies_pval_filtered_genes_only %>% mutate(start = start - 1) head(merged_DEG_subspecies_pval_filtered_genes_only) # Filter for lean up-regulated transcripts (i.e. fold-change < 1) # Convert to BED coordinate system, which is 0-based start # and end EXCLUSIVE; so need to subtract 1 from start coordinates ONLY DEG_subspecies_pval_filtered_genes_only_lean <- filter(merged_DEG_subspecies_pval_filtered_genes_only, fc < 1) %>% mutate(start = start - 1) head(DEG_subspecies_pval_filtered_genes_only_lean) # Filter for siscowet up-regulated transcripts (i.e. fold-change > 1) # Convert to BED coordinate system, which is 0-based start # and end EXCLUSIVE; so need to subtract 1 from start coordinates ONLY DEG_subspecies_pval_filtered_genes_only_siscowet <- filter(merged_DEG_subspecies_pval_filtered_genes_only, fc > 1) %>% mutate(start = start - 1) head(DEG_subspecies_pval_filtered_genes_only_siscowet) # Count number of DEG. count_DEG_subspecies_pval_filtered_stat_genes_only <- nrow(merged_DEG_subspecies_pval_filtered_genes_only) # Count number of lean DEG. count_DEG_subspecies_pval_filtered_genes_only_lean<- nrow(DEG_subspecies_pval_filtered_genes_only_lean) # Count number of siscowet DEG. count_DEG_subspecies_pval_filtered_genes_only_siscowet <- nrow(DEG_subspecies_pval_filtered_genes_only_siscowet) # Check to make sure total number of DEGs matches sum of each treatment compared DEGs. # Stops script if counts do not sum to total DEGs. stopifnot(count_DEG_subspecies_pval_filtered_stat_genes_only == sum(count_DEG_subspecies_pval_filtered_genes_only_lean, count_DEG_subspecies_pval_filtered_genes_only_siscowet ) ) # Print number of DEG cat("Number of", comparison, "DEG with p-values <= ", pvalue, ":", count_DEG_subspecies_pval_filtered_stat_genes_only, "\n") # Print number of lean DEG cat("Number of lean DEG with p-values <= ", pvalue, ":", count_DEG_subspecies_pval_filtered_genes_only_lean, "\n") # Print number of siscowet DEG cat("Number of siscowet DEG with p-values <= ", pvalue, ":", count_DEG_subspecies_pval_filtered_genes_only_siscowet, "\n") ``` ```{r write-DEGs-lean-vs-siscowet-to-files} # Write merged dataframe to CSV write.csv( merged_DEG_subspecies_pval_filtered_genes_only, file = "../analyses/DEG-subspecies-pval_filtered_p-0.05.csv", quote = FALSE, row.names = FALSE) # Write pval/qval filtered dataframe to CSV write.csv( DEG_subspecies_filtered_stat_results, file = "../analyses/DEG_subspecies-pval_qval_filtered_p-q-0.05.csv", quote = FALSE, row.names = FALSE ) # Creates a BED file of all DEGs and inserts necessary columns to create properly formatted BED file. write.table( ( merged_DEG_subspecies_pval_filtered_genes_only %>% select(chr, start, end, id, strand) %>% add_column( score = "0", # Inserts a "score" column and assigns a value of "0" to all rows. .before = "strand" ) ), file = "../analyses/DEG-subspecies-pval_filtered_p-0.05.bed", sep = "\t", col.names = FALSE, row.names = FALSE, quote = FALSE ) # Creates a BED file of only lean DEGs and inserts necessary columns to create properly formatted BED file. write.table( ( DEG_subspecies_pval_filtered_lean %>% select(chr, start, end, id, strand) %>% add_column( score = "0", # Inserts a "score" column and assigns a value of "0" to all rows. .before = "strand" ) ), file = "../analyses/DEG-subspecies-lean-pval_filtered_p-0.05.bed", sep = "\t", col.names = FALSE, row.names = FALSE, quote = FALSE ) # Creates a BED file of only siscowet DEGs and inserts necessary columns to create properly formatted BED file. write.table( ( DEG_subspecies_pval_filtered_siscowet %>% select(chr, start, end, id, strand) %>% add_column( score = "0", # Inserts a "score" column and assigns a value of "0" to all rows. .before = "strand" ) ), file = "../analyses/DEG-subspecies-siscowet-pval_filtered_p-0.05.bed", sep = "\t", col.names = FALSE, row.names = FALSE, quote = FALSE ) ```