--- title: "Calculating transcript counts per gene per sample in C.virginica gonad exposed to elevated pCO2 using Ballgown" author: "Sam White" date: "1/27/2022" output: html_document --- # Use [Ballgown](https://github.com/alyssafrazee/ballgown) for calculating transcript counts per gene per sample in _C.virginica_ gonad tissue exposed to elevated pCO2. REQUIRES Linux-based system to run all chunks properly; some chunks will not work on Mac OS! REQUIRES the following Bash programs: - `wget` - `tree` - `md5sum` REQUIRES the following R libraries: - [`Ballgown`](https://github.com/alyssafrazee/ballgown) (Bioconductor) - `tidyverse` ## Load `R` libraries ```{r} library("ballgown") library("tidyverse") ``` ## Set variables ```{r set-varaibles} # Vectors for subsetting samples by different groups controls <- c("S13M", "S16F", "S19F", "S39F", "S44F", "S52F", "S53F", "S54F", "S64M", "S6M", "S76F", "S7M") exposed <- c("S12M", "S22F", "S23M", "S29F", "S31M", "S35F", "S36F", "S3F", "S41F", "S48M", "S50F", "S59M", "S77F", "S9M") controls_males <- c("S13M", "S64M", "S6M", "S7M") exposed_males <- c("S12M", "S23M", "S31M", "S48M", "S59M", "S9M") controls_females <- c("S16F", "S19F", "S39F", "S44F", "S52F", "S53F", "S54F", "S76F") exposed_females <- c("S22F", "S29F", "S35F", "S36F", "S3F", "S41F", "S50F", "S77F") # Vector of count calculations column names count_calcs <- c("sum_transcript_counts", "median_transcript_counts", "mean_transcript_counts", "max_transcript_counts", "min_transcript_counts", "std_dev_transcript_counts", "male_sum_transcript_counts", "male_median_transcript_counts" , "male_mean_transcript_counts", "male_max_transcript_counts", "male_min_transcript_counts", "male_std_dev_transcript_counts", "female_sum_transcript_counts", "female_median_transcript_counts", "female_mean_transcript_counts", "female_max_transcript_counts", "female_min_transcript_counts", "female_std_dev_transcript_counts", "controls_sum_transcript_counts", "controls_median_transcript_counts", "controls_mean_transcript_counts", "controls_max_transcript_counts", "controls_min_transcript_counts", "controls_std_dev_transcript_counts", "exposed_sum_transcript_counts", "exposed_median_transcript_counts", "exposed_mean_transcript_counts", "exposed_max_transcript_counts", "exposed_min_transcript_counts", "exposed_std_dev_transcript_counts", "control_females_sum_transcript_counts", "control_females_median_transcript_counts", "control_females_mean_transcript_counts", "control_females_max_transcript_counts", "control_females_min_transcript_counts", "control_females_std_dev_transcript_counts", "exposed_females_sum_transcript_counts", "exposed_females_median_transcript_counts", "exposed_females_mean_transcript_counts", "exposed_females_max_transcript_counts", "exposed_females_min_transcript_counts", "exposed_females_std_dev_transcript_counts", "control_males_sum_transcript_counts", "control_males_median_transcript_counts", "control_males_mean_transcript_counts", "control_males_max_transcript_counts", "control_males_min_transcript_counts", "control_males_std_dev_transcript_counts", "exposed_males_sum_transcript_counts", "exposed_males_median_transcript_counts", "exposed_males_mean_transcript_counts", "exposed_males_max_transcript_counts", "exposed_males_min_transcript_counts", "exposed_males_std_dev_transcript_counts" ) # Initialize list of data frames list_transcript_counts_dfs <- list() ``` ## Download Ballgown input files. Notebooks detailing their creation: - [FastQ trimming](https://robertslab.github.io/sams-notebook/2022/02/24/Trimming-Additional-20bp-from-C.virginica-Gonad-RNAseq-with-fastp-on-Mox.html) - [Genome indexing, and exon/splice sites with HISAT2](https://robertslab.github.io/sams-notebook/2021/07/20/Genome-Annotations-Splice-Site-and-Exon-Extractions-for-C.virginica-GCF_002022765.2-Genome-Using-Hisat2-on-Mox.html) - [Mapping and identificaion of isoforms with StingTie](https://robertslab.github.io/sams-notebook/2022/02/25/Transcript-Identification-and-Alignments-C.virginica-RNAseq-with-NCBI-Genome-GCF_002022765.2-Using-Hisat2-and-Stringtie-on-Mox.html) ```{bash} # Download Ballgown input files and directory structure wget \ --directory-prefix ../data/ballgown \ --recursive \ --no-check-certificate \ --continue \ --cut-dirs 2 \ --no-host-directories \ --no-parent \ --quiet \ --reject "input_fastqs_checksums.md5" \ --accept "*.ctab,*checksums.md5" https://gannet.fish.washington.edu/Atumefaciens/20220225_cvir_stringtie_GCF_002022765.2_isoforms/ ``` ## Verify checksums NOTE: Warnings are expected, as the checksums files have checksums for files that are not downloaded for this project. ```{bash} cd ../data/ballgown # Make a line line="-----------------------------------------------------------------------------------------------" # Set working directory wd=$(pwd) # Loop through directories and verify checksums for directory in */ do cd "${directory}" # Get sample name; strips trailing slash from directory name sample="${directory%/}" echo ${line} echo "${sample}" echo "" # Confirm checksums; sorts for easier reading md5sum --check "${sample}"_checksums.md5 | sort -V echo "" echo "${line}" echo "" cd ${wd} done # Show downloaded directories/files tree ``` ## Find Ballgown installation location ```{r} data_directory <- system.file('extdata', package='ballgown') # automatically finds ballgown's installation directory # examine data_directory: data_directory ``` ## Create Ballgown object ```{r} # Uses regular expression in samplePattern to find all pertinent folders # Load all measurement data bg <- ballgown(dataDir="../data/ballgown/", samplePattern='S.*[FM]', meas='all') bg ``` ## Download and filter metadata file Filtered metadata will be used to create a phenotype dataframe needed for Ballgown differential expression analysis. `TreatmentN` column explanation: control males = 1 control females = 2 exposed males = 3 exposed females = 4 ```{r create-dataframes-for-ballgown-pData} # Read in metadata file from URL sample_metadata_full <- read.csv("https://raw.githubusercontent.com/epigeneticstoocean/2018_L18-adult-methylation/main/data/adult-meta.csv") head(sample_metadata_full) # Subset metadata in preparation of creating pData data frame for Ballgown # Sort by OldSample.ID to ensure matches directory structure (required for Ballgown) sample_metadata_subset <- sample_metadata_full %>% select(OldSample.ID, Treatment, TreatmentN, Sex) %>% arrange(OldSample.ID) # View subsetted metadata head(sample_metadata_subset) ``` ## Load phenotype dataframe into Ballgown object ```{r load-phenotype-info-ballgown} # Load phenotype info into Ballgown pData(bg) <- sample_metadata_subset # Examine phenotype data as it exists in Ballgown phenotype_table <- pData(bg) head(phenotype_table) ``` ## Load all transcript expression data ```{r load-all-transcript-expression-data} # Expression data whole_tx_table <- texpr(bg, 'all') head(whole_tx_table) ``` ## Count transcripts for each gene for each sample. ## As transcript is counted if it has an FPKM value > 0. ```{r count-transcripts-per-gene-per-sample} # Rename gene_names listed as a "." whole_tx_table <- whole_tx_table %>% mutate(gene_name = ifelse(gene_name == ".", t_name, gene_name)) # Create table of transcript counts per gene per sample transcript_counts <- whole_tx_table %>% select(starts_with(c("gene_name", "FPKM"))) %>% group_by(gene_name) %>% summarise((across(everything(), ~sum(. > 0)))) head(transcript_counts) # Rename columns names(transcript_counts) <- gsub(x = names(transcript_counts), pattern = "FPKM", replacement = "transcript_counts") head(transcript_counts) # Perform calculations transcript_counts_per_gene_per_sample <- transcript_counts %>% rowwise() %>% mutate( sum_transcript_counts = sum(c_across(where(is.numeric) & -any_of(count_calcs))), median_transcript_counts = median(c_across(where(is.numeric) & -any_of(count_calcs))), mean_transcript_counts = mean(c_across(where(is.numeric) & -any_of(count_calcs))), max_transcript_counts = max(c_across(where(is.numeric) & -any_of(count_calcs))), min_transcript_counts = min(c_across(where(is.numeric) & -any_of(count_calcs))), std_dev_transcript_counts = sd(c_across(where(is.numeric) & -any_of(count_calcs))) ) # Male stats transcript_counts_per_gene_per_sample_males <- transcript_counts %>% rowwise() %>% select("gene_name", ends_with('M')) %>% mutate( male_sum_transcript_counts = sum(c_across(where(is.numeric) & -any_of(count_calcs))), male_median_transcript_counts = median(c_across(where(is.numeric) & -any_of(count_calcs))), male_mean_transcript_counts = mean(c_across(where(is.numeric) & -any_of(count_calcs))), male_max_transcript_counts = max(c_across(where(is.numeric) & -any_of(count_calcs))), male_min_transcript_counts = min(c_across(where(is.numeric) & -any_of(count_calcs))), male_std_dev_transcript_counts = sd(c_across(where(is.numeric) & -any_of(count_calcs))) ) # Female stats transcript_counts_per_gene_per_sample_females <- transcript_counts %>% rowwise() %>% select("gene_name", ends_with('F')) %>% mutate( female_sum_transcript_counts = sum(c_across(where(is.numeric) & -any_of(count_calcs))), female_median_transcript_counts = median(c_across(where(is.numeric) & -any_of(count_calcs))), female_mean_transcript_counts = mean(c_across(where(is.numeric) & -any_of(count_calcs))), female_max_transcript_counts = max(c_across(where(is.numeric) & -any_of(count_calcs))), female_min_transcript_counts = min(c_across(where(is.numeric) & -any_of(count_calcs))), female_std_dev_transcript_counts = sd(c_across(where(is.numeric) & -any_of(count_calcs))) ) # Controls stats transcript_counts_per_gene_per_sample_controls <- transcript_counts %>% rowwise() %>% select("gene_name", ends_with(controls)) %>% mutate( controls_sum_transcript_counts = sum(c_across(where(is.numeric) & -any_of(count_calcs))), controls_median_transcript_counts = median(c_across(where(is.numeric) & -any_of(count_calcs))), controls_mean_transcript_counts = mean(c_across(where(is.numeric) & -any_of(count_calcs))), controls_max_transcript_counts = max(c_across(where(is.numeric) & -any_of(count_calcs))), controls_min_transcript_counts = min(c_across(where(is.numeric) & -any_of(count_calcs))), controls_std_dev_transcript_counts = sd(c_across(where(is.numeric) & -any_of(count_calcs))) ) # Exposed stats transcript_counts_per_gene_per_sample_exposed <- transcript_counts %>% rowwise() %>% select("gene_name", ends_with(exposed)) %>% mutate( exposed_sum_transcript_counts = sum(c_across(where(is.numeric) & -any_of(count_calcs))), exposed_median_transcript_counts = median(c_across(where(is.numeric) & -any_of(count_calcs))), exposed_mean_transcript_counts = mean(c_across(where(is.numeric) & -any_of(count_calcs))), exposed_max_transcript_counts = max(c_across(where(is.numeric) & -any_of(count_calcs))), exposed_min_transcript_counts = min(c_across(where(is.numeric) & -any_of(count_calcs))), exposed_std_dev_transcript_counts = sd(c_across(where(is.numeric) & -any_of(count_calcs))) ) # Controls females stats transcript_counts_per_gene_per_sample_controls_females <- transcript_counts %>% rowwise() %>% select("gene_name", ends_with(controls_females)) %>% mutate( control_females_sum_transcript_counts = sum(c_across(where(is.numeric) & -any_of(count_calcs))), control_females_median_transcript_counts = median(c_across(where(is.numeric) & -any_of(count_calcs))), control_females_mean_transcript_counts = mean(c_across(where(is.numeric) & -any_of(count_calcs))), control_females_max_transcript_counts = max(c_across(where(is.numeric) & -any_of(count_calcs))), control_females_min_transcript_counts = min(c_across(where(is.numeric) & -any_of(count_calcs))), control_females_std_dev_transcript_counts = sd(c_across(where(is.numeric) & -any_of(count_calcs))) ) # Exposed females stats transcript_counts_per_gene_per_sample_exposed_females <- transcript_counts %>% rowwise() %>% select("gene_name", ends_with(exposed_females)) %>% mutate( exposed_females_sum_transcript_counts = sum(c_across(where(is.numeric) & -any_of(count_calcs))), exposed_females_median_transcript_counts = median(c_across(where(is.numeric) & -any_of(count_calcs))), exposed_females_mean_transcript_counts = mean(c_across(where(is.numeric) & -any_of(count_calcs))), exposed_females_max_transcript_counts = max(c_across(where(is.numeric) & -any_of(count_calcs))), exposed_females_min_transcript_counts = min(c_across(where(is.numeric) & -any_of(count_calcs))), exposed_females_std_dev_transcript_counts = sd(c_across(where(is.numeric) & -any_of(count_calcs))) ) # Controls males stats transcript_counts_per_gene_per_sample_controls_males <- transcript_counts %>% rowwise() %>% select("gene_name", ends_with(controls_males)) %>% mutate( control_males_sum_transcript_counts = sum(c_across(where(is.numeric) & -any_of(count_calcs))), control_males_median_transcript_counts = median(c_across(where(is.numeric) & -any_of(count_calcs))), control_males_mean_transcript_counts = mean(c_across(where(is.numeric) & -any_of(count_calcs))), control_males_max_transcript_counts = max(c_across(where(is.numeric) & -any_of(count_calcs))), control_males_min_transcript_counts = min(c_across(where(is.numeric) & -any_of(count_calcs))), control_males_std_dev_transcript_counts = sd(c_across(where(is.numeric) & -any_of(count_calcs))) ) # Exposed males stats transcript_counts_per_gene_per_sample_exposed_males <- transcript_counts %>% rowwise() %>% select("gene_name", ends_with(exposed_males)) %>% mutate( exposed_males_sum_transcript_counts = sum(c_across(where(is.numeric) & -any_of(count_calcs))), exposed_males_median_transcript_counts = median(c_across(where(is.numeric) & -any_of(count_calcs))), exposed_males_mean_transcript_counts = mean(c_across(where(is.numeric) & -any_of(count_calcs))), exposed_males_max_transcript_counts = max(c_across(where(is.numeric) & -any_of(count_calcs))), exposed_males_min_transcript_counts = min(c_across(where(is.numeric) & -any_of(count_calcs))), exposed_males_std_dev_transcript_counts = sd(c_across(where(is.numeric) & -any_of(count_calcs))) ) # Add data frames to list # Wraps ls() with grep to allow for needed perl regex (the "^(?!list).*" aspect) because # ls() doesn't support perl regex # Regex excludes any results beginning with the word "list" list_transcript_counts_dfs <- mget(grep("^(?!list).*", ls(pattern = "transcript_counts_per_gene_per_sample"), value = TRUE, perl = TRUE)) head(transcript_counts) ``` ```{r write-transcript-per-gene-counts-to-files} # Write data frames to CSVs in ../analyses/dir # Uses names of data frames as names of output files. sapply(names(list_transcript_counts_dfs), function(x) write.csv(list_transcript_counts_dfs[[x]], file = file.path("../analyses/", paste(x, "csv", sep=".")), quote = FALSE, row.names = FALSE) ) ```