--- title: "Calculating FPKM coefficients of variation across sex/treatments in C.virginica gonad exposed to elevated pCO2 using Ballgown" author: "Sam White" date: "2/4/2022" output: html_document --- # Use [Ballgown](https://github.com/alyssafrazee/ballgown) to extract gene and transcript FPKM values and then calculate coefficients of variation across samples within treatments/sex of _C.virginica_ gonad tissue exposed to elevated pCO2. REQUIRES Linux-based system to run all chunks properly; some chunks will not work on Mac OS! REQUIRES the following Bash programs: - `wget` - `tree` - `md5sum` REQUIRES the following R libraries: - [`Ballgown`](https://github.com/alyssafrazee/ballgown) (Bioconductor) - `tidyverse` ## Load `R` libraries ```{r} library("ballgown") library("tidyverse") library("Rfast") ``` ## Download Ballgown input files. Notebooks detailing their creation: - [FastQ trimming](https://robertslab.github.io/sams-notebook/2022/02/24/Trimming-Additional-20bp-from-C.virginica-Gonad-RNAseq-with-fastp-on-Mox.html) - [Genome indexing, and exon/splice sites with HISAT2](https://robertslab.github.io/sams-notebook/2021/07/20/Genome-Annotations-Splice-Site-and-Exon-Extractions-for-C.virginica-GCF_002022765.2-Genome-Using-Hisat2-on-Mox.html) - [Mapping and identificaion of isoforms with StingTie](https://robertslab.github.io/sams-notebook/2022/02/25/Transcript-Identification-and-Alignments-C.virginica-RNAseq-with-NCBI-Genome-GCF_002022765.2-Using-Hisat2-and-Stringtie-on-Mox.html) ```{bash} # Download Ballgown input files and directory structure wget \ --directory-prefix ../data/ballgown \ --recursive \ --no-check-certificate \ --continue \ --cut-dirs 2 \ --no-host-directories \ --no-parent \ --quiet \ --reject "input_fastqs_checksums.md5" \ --accept "*.ctab,*checksums.md5" https://gannet.fish.washington.edu/Atumefaciens/20220225_cvir_stringtie_GCF_002022765.2_isoforms/ ``` ## Verify checksums NOTE: Warnings are expected, as the checksums files have checksums for files that are not downloaded for this project. ```{bash} cd ../data/ballgown # Make a line line="-----------------------------------------------------------------------------------------------" # Set working directory wd=$(pwd) # Loop through directories and verify checksums for directory in */ do cd "${directory}" # Get sample name; strips trailing slash from directory name sample="${directory%/}" echo ${line} echo "${sample}" echo "" # Confirm checksums; sorts for easier reading md5sum --check "${sample}"_checksums.md5 | sort -V echo "" echo "${line}" echo "" cd ${wd} done # Show downloaded directories/files tree ``` ## Find Ballgown installation location ```{r} data_directory <- system.file('extdata', package='ballgown') # automatically finds ballgown's installation directory # examine data_directory: data_directory ``` ## Create Ballgown object ```{r} # Uses regular expression in samplePattern to find all pertinent folders # Load all measurement data bg <- ballgown(dataDir="../data/ballgown/", samplePattern='S.*[FM]', meas='all') bg ``` ## Download and filter metadata file Filtered metadata will be used to create a phenotype dataframe needed for Ballgown differential expression analysis. `TreatmentN` column explanation: control males = 1 control females = 2 exposed males = 3 exposed females = 4 ```{r create-dataframes-for-ballgown-pData} # Read in metadata file from URL sample_metadata_full <- read.csv("https://raw.githubusercontent.com/epigeneticstoocean/2018_L18-adult-methylation/main/data/adult-meta.csv") head(sample_metadata_full) # Subset metadata in preparation of creating pData data frame for Ballgown # Sort by OldSample.ID to ensure matches directory structure (required for Ballgown) sample_metadata_subset <- sample_metadata_full %>% select(OldSample.ID, Treatment, TreatmentN, Sex) %>% arrange(OldSample.ID) # View subsetted metadata head(sample_metadata_subset) ``` ## Load phenotype dataframe into Ballgown object ```{r load-phenotype-info-ballgown} # Load phenotype info into Ballgown pData(bg) <- sample_metadata_subset # Examine phenotype data as it exists in Ballgown phenotype_table <- pData(bg) head(phenotype_table) ``` ## Load all transcript expression data ```{r} # Expression data whole_tx_table <- texpr(bg, 'all') # Rename gene_names listed as a "." whole_tx_table <- whole_tx_table %>% mutate(gene_name = ifelse(gene_name == ".", t_name, gene_name)) head(whole_tx_table) ``` ## Load all gene expression data ```{r load-gene-expression-data} whole_gx_table <- gexpr(bg) head(whole_gx_table) ``` ## Calculate coefficients of variation `TreatmentN` column explanation: control males = 1 control females = 2 exposed males = 3 exposed females = 4 ```{r coefficients-of-variation} # Vector of treatments treatments <- c(1:4) # Vector of comparisons comparisons <- c("all", "f_v_m", "c_v_e") # Create vector of ballgown gene names ## Moves the gene ids from rownames to a column in a data frame ballgown_genes_df <- as.data.frame(gexpr(bg)) %>% rownames_to_column(var = "gene_id") ## Extracts gene ids genes <- ballgown_genes_df$gene_id # Declare lists transcripts_treatment_fpkm_list <- list() genes_treatment_fpkm_list <- list() # Remove data frame if it exists. # Prevent problem of re-running and accidentally appending more data to existing data frame below. rm(genes_control.vs.exposed_fpkm_CoV_df, genes_female.vs.male_fpkm_CoV_df, genes_treatment_fpkm_CoV_df, transcripts_control.vs.exposed_fpkm_CoV_df, transcripts_female.vs.male_fpkm_CoV_df, transcripts_treatment_fpkm_CoV_df) # Create list of FPKMs by treatment. # Ordered in the list as listed above for the "TreatmentN" column explanation. for (treatment in treatments) { # Create ballgown objects containing only data from individual treatments bg_transcripts <- ballgown::subset(bg, "TreatmentN == treatment", genomesubset = FALSE) # Create matrix containing gene FPKM from individual treatments bg_genes <- gexpr(ballgown::subset(bg, "TreatmentN == treatment", genomesubset = FALSE)) # Extract transcript FPKMs and add to list transcripts_treatment_fpkm_list[[treatment]] <- texpr(bg_transcripts) # Extract gene FPKMS and add to list genes_treatment_fpkm_list[[treatment]] <- bg_genes } for (comparison in comparisons) { # Create data frame with coefficients of variation for each Treatment group for (index in 1:length(transcripts_treatment_fpkm_list)) { if (comparison == "all") { # Set treatment column name treatment_col <- paste("Treatment", index, "_", "FPKM_Coeff_of_Var", sep = "") # Checks if data frame exists if (exists("transcripts_treatment_fpkm_CoV_df")) { # Add new column with coefficients of variation for each subsequent TreatmentN transcripts_treatment_fpkm_CoV_df$column <- rowcvs(transcripts_treatment_fpkm_list[[index]], unbiased = TRUE) } else # Creates initial data frame with coefficients of variation for FPKM in Treatment 1 transcripts_treatment_fpkm_CoV_df <- as.data.frame(rowcvs(transcripts_treatment_fpkm_list[[index]], unbiased = TRUE)) # Checks if data frame exists if (exists("genes_treatment_fpkm_CoV_df")) { # Add new column with coefficients of variation for each subsequent TreatmentN genes_treatment_fpkm_CoV_df$column <- rowcvs(genes_treatment_fpkm_list[[index]], unbiased = TRUE) } else # Creates initial data frame with coefficients of variation for FPKM in Treatment 1 genes_treatment_fpkm_CoV_df <- as.data.frame(rowcvs(genes_treatment_fpkm_list[[index]], unbiased = TRUE)) # Renames columns names(transcripts_treatment_fpkm_CoV_df)[index] <- treatment_col names(genes_treatment_fpkm_CoV_df)[index] <- treatment_col } # Treatments 2&4 (females) if (comparison == "f_v_m" & (index == "2" || index == "4")) { # If matrix doesn't exist, create it if (!exists("transcripts_females_matrix")) { transcripts_females_matrix <- transcripts_treatment_fpkm_list[[index]] # If matrix does exist, append column to matrix } else transcripts_females_matrix <- cbind(transcripts_treatment_fpkm_list[[index]]) # If matrix doesn't exist, create it if (!exists("genes_females_matrix")) { genes_females_matrix <- genes_treatment_fpkm_list[[index]] # If matrix does exist, append column to matrix } else genes_females_matrix <- cbind(genes_treatment_fpkm_list[[index]]) # Checks is data frame exists if (exists("transcripts_female.vs.male_fpkm_CoV_df")) { # Add new column with coefficients of variation for each subsequent TreatmentN transcripts_female.vs.male_fpkm_CoV_df$column <- rowcvs(transcripts_females_matrix, unbiased = TRUE) } else # Creates initial data frame with coefficients of variation for FPKM in Treatment 1 transcripts_female.vs.male_fpkm_CoV_df <- as.data.frame(rowcvs(transcripts_females_matrix, unbiased = TRUE)) # Checks is data frame exists if (exists("genes_female.vs.male_fpkm_CoV_df")) { # Add new column with coefficients of variation for each subsequent TreatmentN genes_female.vs.male_fpkm_CoV_df$column <- rowcvs(genes_females_matrix, unbiased = TRUE) } else # Creates initial data frame with coefficients of variation for FPKM in Treatment 1 genes_female.vs.male_fpkm_CoV_df <- as.data.frame(rowcvs(genes_females_matrix, unbiased = TRUE)) } # Treatments 1&3 (males) else if (comparison == "f_v_m" & (index == "1" || index == "3")) { # If matrix doesn't exist, create it if (!exists("transcripts_males_matrix")) { transcripts_males_matrix <- transcripts_treatment_fpkm_list[[index]] # If matrix does exist, append column to matrix } else transcripts_males_matrix <- cbind(transcripts_treatment_fpkm_list[[index]]) # Checks if data frame exists if (exists("transcripts_female.vs.male_fpkm_CoV_df")) { # Add new column with coefficients of variation for each subsequent TreatmentN transcripts_female.vs.male_fpkm_CoV_df$column <- rowcvs(transcripts_males_matrix, unbiased = TRUE) } else # Creates initial data frame with coefficients of variation for FPKM in Treatment 1 transcripts_female.vs.male_fpkm_CoV_df <- as.data.frame(rowcvs(transcripts_males_matrix, unbiased = TRUE)) # If matrix doesn't exist, create it if (!exists("genes_males_matrix")) { genes_males_matrix <- genes_treatment_fpkm_list[[index]] # If matrix does exist, append column to matrix } else genes_males_matrix <- cbind(genes_treatment_fpkm_list[[index]]) # Checks if data frame exists if (exists("genes_female.vs.male_fpkm_CoV_df")) { # Add new column with coefficients of variation for each subsequent TreatmentN genes_female.vs.male_fpkm_CoV_df$column <- rowcvs(genes_males_matrix, unbiased = TRUE) } else # Creates initial data frame with coefficients of variation for FPKM in Treatment 1 genes_female.vs.male_fpkm_CoV_df <- as.data.frame(rowcvs(genes_males_matrix, unbiased = TRUE)) } # Treatments 1&2 (control) else if (comparison == "c_v_e" & (index == "1" || index == "2")) { # If matrix doesn't exist, create it if (!exists("transcripts_controls_matrix")) { transcripts_controls_matrix <- transcripts_treatment_fpkm_list[[index]] # If matrix does exist, append column to matrix } else transcripts_controls_matrix <- cbind(transcripts_treatment_fpkm_list[[index]]) # Checks if data frame exists if (exists("transcripts_control.vs.exposed_fpkm_CoV_df")) { # Add new column with coefficients of variation for each subsequent TreatmentN transcripts_control.vs.exposed_fpkm_CoV_df$column <- rowcvs(transcripts_controls_matrix, unbiased = TRUE) } else # Creates initial data frame with coefficients of variation for FPKM in Treatment 1 transcripts_control.vs.exposed_fpkm_CoV_df <- as.data.frame(rowcvs(transcripts_controls_matrix, unbiased = TRUE)) # If matrix doesn't exist, create it if (!exists("genes_controls_matrix")) { genes_controls_matrix <- genes_treatment_fpkm_list[[index]] # If matrix does exist, append column to matrix } else genes_controls_matrix <- cbind(genes_treatment_fpkm_list[[index]]) # Checks if data frame exists if (exists("genes_control.vs.exposed_fpkm_CoV_df")) { # Add new column with coefficients of variation for each subsequent TreatmentN genes_control.vs.exposed_fpkm_CoV_df$column <- rowcvs(genes_controls_matrix, unbiased = TRUE) } else # Creates initial data frame with coefficients of variation for FPKM in Treatment 1 genes_control.vs.exposed_fpkm_CoV_df <- as.data.frame(rowcvs(genes_controls_matrix, unbiased = TRUE)) } # Treatments 3&4 (exposed) else if (comparison == "c_v_e" & (index == "3" || index == "4")) { # If matrix doesn't exist, create it if (!exists("transcripts_exposed_matrix")) { transcripts_exposed_matrix <- transcripts_treatment_fpkm_list[[index]] # If matrix does exist, append column to matrix } else transcripts_exposed_matrix <- cbind(transcripts_treatment_fpkm_list[[index]]) # Checks if data frame exists if (exists("transcripts_control.vs.exposed_fpkm_CoV_df")) { # Add new column with coefficients of variation for each subsequent TreatmentN transcripts_control.vs.exposed_fpkm_CoV_df$column <- rowcvs(transcripts_exposed_matrix, unbiased = TRUE) } else # Creates initial data frame with coefficients of variation for FPKM in Treatment 1 transcripts_control.vs.exposed_fpkm_CoV_df <- as.data.frame(rowcvs(transcripts_exposed_matrix, unbiased = TRUE)) # If matrix doesn't exist, create it if (!exists("genes_exposed_matrix")) { genes_exposed_matrix <- genes_treatment_fpkm_list[[index]] # If matrix does exist, append column to matrix } else genes_exposed_matrix <- cbind(genes_treatment_fpkm_list[[index]]) # Checks if data frame exists if (exists("genes_control.vs.exposed_fpkm_CoV_df")) { # Add new column with coefficients of variation for each subsequent TreatmentN genes_control.vs.exposed_fpkm_CoV_df$column <- rowcvs(genes_exposed_matrix, unbiased = TRUE) } else # Creates initial data frame with coefficients of variation for FPKM in Treatment 1 genes_control.vs.exposed_fpkm_CoV_df <- as.data.frame(rowcvs(genes_exposed_matrix, unbiased = TRUE)) } } } # Rename columns names(transcripts_female.vs.male_fpkm_CoV_df)[1] <- paste("males", "_", "FPKM_Coeff_of_Var", sep = "") names(transcripts_female.vs.male_fpkm_CoV_df)[2] <- paste("females", "_", "FPKM_Coeff_of_Var", sep = "") names(transcripts_control.vs.exposed_fpkm_CoV_df)[1] <- paste("controls", "_", "FPKM_Coeff_of_Var", sep = "") names(transcripts_control.vs.exposed_fpkm_CoV_df)[2] <- paste("exposed", "_", "FPKM_Coeff_of_Var", sep = "") names(genes_female.vs.male_fpkm_CoV_df)[1] <- paste("males", "_", "FPKM_Coeff_of_Var", sep = "") names(genes_female.vs.male_fpkm_CoV_df)[2] <- paste("females", "_", "FPKM_Coeff_of_Var", sep = "") names(genes_control.vs.exposed_fpkm_CoV_df)[1] <- paste("controls", "_", "FPKM_Coeff_of_Var", sep = "") names(genes_control.vs.exposed_fpkm_CoV_df)[2] <- paste("exposed", "_", "FPKM_Coeff_of_Var", sep = "") # Create gene ids column genes_control.vs.exposed_fpkm_CoV_df <- cbind(gene_ids = genes, genes_control.vs.exposed_fpkm_CoV_df) genes_female.vs.male_fpkm_CoV_df <- cbind(gene_ids = genes, genes_female.vs.male_fpkm_CoV_df) genes_treatment_fpkm_CoV_df <- cbind(gene_ids = genes, genes_treatment_fpkm_CoV_df) # Remove first row containing combo of unannotated transcripts genes_control.vs.exposed_fpkm_CoV_df <- genes_control.vs.exposed_fpkm_CoV_df[-1,] genes_female.vs.male_fpkm_CoV_df <- genes_female.vs.male_fpkm_CoV_df[-1,] genes_treatment_fpkm_CoV_df <- genes_treatment_fpkm_CoV_df[-1,] # Create transcript ID column transcripts_control.vs.exposed_fpkm_CoV_df <- transcripts_control.vs.exposed_fpkm_CoV_df %>% rownames_to_column(var = "t_id") transcripts_female.vs.male_fpkm_CoV_df <- transcripts_female.vs.male_fpkm_CoV_df %>% rownames_to_column(var = "t_id") transcripts_treatment_fpkm_CoV_df <- transcripts_treatment_fpkm_CoV_df %>% rownames_to_column(var = "t_id") # Check genes data frames head(genes_control.vs.exposed_fpkm_CoV_df) head(genes_female.vs.male_fpkm_CoV_df) head(genes_treatment_fpkm_CoV_df) # Check transcripts data frames head(transcripts_control.vs.exposed_fpkm_CoV_df) head(transcripts_female.vs.male_fpkm_CoV_df) head(transcripts_treatment_fpkm_CoV_df) # Write genes data frames to CSVs write.csv(genes_control.vs.exposed_fpkm_CoV_df, file = "../analyses/genes_control.vs.exposed_fpkm_CoV_df.csv", row.names = FALSE, quote = FALSE) write.csv(genes_female.vs.male_fpkm_CoV_df, file = "../analyses/genes_female.vs.male_fpkm_CoV_df.csv", row.names = FALSE, quote = FALSE) write.csv(genes_treatment_fpkm_CoV_df, file = "../analyses/genes_treatment_fpkm_CoV_df.csv", row.names = FALSE, quote = FALSE) # Write transcripts data frames to CSVs write.csv(transcripts_control.vs.exposed_fpkm_CoV_df, file = "../analyses/transcripts_control.vs.exposed_fpkm_CoV_df.csv", row.names = FALSE, quote = FALSE) write.csv(transcripts_female.vs.male_fpkm_CoV_df, file = "../analyses/transcripts_female.vs.male_fpkm_CoV_df.csv", row.names = FALSE, quote = FALSE) write.csv(transcripts_treatment_fpkm_CoV_df, file = "../analyses/transcripts_treatment_fpkm_CoV_df.csv", row.names = FALSE, quote = FALSE) ```