--- title: "Calculating transcript counts per gene per sample in C.virginica gonad exposed to elevated pCO2 using Ballgown" author: "Sam White" date: "1/27/2022" output: html_document --- # Uses all transcripts expression table, generated using [Ballgown](https://github.com/alyssafrazee/ballgown), for calculating transcript counts per gene per sample in _C.virginica_ gonad tissue exposed to elevated pCO2. REQUIRES the following R libraries: - `tidyverse` # Load `R` libraries ```{r} library("tidyverse") ``` # Set variables ```{r set-variables} # Vectors for subsetting samples by different groups all <- c("S13M", "S16F", "S19F", "S39F", "S44F", "S52F", "S53F", "S54F", "S64M", "S6M", "S76F", "S7M", "S12M", "S22F", "S23M", "S29F", "S31M", "S35F", "S36F", "S3F", "S41F", "S48M", "S50F", "S59M", "S77F", "S9M") controls <- c("S13M", "S16F", "S19F", "S39F", "S44F", "S52F", "S53F", "S54F", "S64M", "S6M", "S76F", "S7M") exposed <- c("S12M", "S22F", "S23M", "S29F", "S31M", "S35F", "S36F", "S3F", "S41F", "S48M", "S50F", "S59M", "S77F", "S9M") males_controls <- c("S13M", "S64M", "S6M", "S7M") males_exposed <- c("S12M", "S23M", "S31M", "S48M", "S59M", "S9M") females_controls <- c("S16F", "S19F", "S39F", "S44F", "S52F", "S53F", "S54F", "S76F") females_exposed <- c("S22F", "S29F", "S35F", "S36F", "S3F", "S41F", "S50F", "S77F") # Vector of comparisons comparisons <- c("f_vs_m", "c_vs_e", "c.f_vs_e.f", "c.m_vs_e.m", "c.f_vs_c.m", "e.f_vs_e.m") # Vector of count calculations column names count_calcs <- c("sum_transcript_counts", "median_transcript_counts", "mean_transcript_counts", "max_transcript_counts", "min_transcript_counts", "std_dev_transcript_counts", "male_sum_transcript_counts", "male_median_transcript_counts" , "male_mean_transcript_counts", "male_max_transcript_counts", "male_min_transcript_counts", "male_std_dev_transcript_counts", "female_sum_transcript_counts", "female_median_transcript_counts", "female_mean_transcript_counts", "female_max_transcript_counts", "female_min_transcript_counts", "female_std_dev_transcript_counts", "controls_sum_transcript_counts", "controls_median_transcript_counts", "controls_mean_transcript_counts", "controls_max_transcript_counts", "controls_min_transcript_counts", "controls_std_dev_transcript_counts", "exposed_sum_transcript_counts", "exposed_median_transcript_counts", "exposed_mean_transcript_counts", "exposed_max_transcript_counts", "exposed_min_transcript_counts", "exposed_std_dev_transcript_counts", "females_controls_sum_transcript_counts", "females_controls_median_transcript_counts", "females_controls_mean_transcript_counts", "females_controls_max_transcript_counts", "females_controls_min_transcript_counts", "females_controls_std_dev_transcript_counts", "females_exposed_sum_transcript_counts", "females_exposed_median_transcript_counts", "females_exposed_mean_transcript_counts", "females_exposed_max_transcript_counts", "females_exposed_min_transcript_counts", "females_exposed_std_dev_transcript_counts", "males_controls_sum_transcript_counts", "males_controls_median_transcript_counts", "males_controls_mean_transcript_counts", "males_controls_max_transcript_counts", "males_controls_min_transcript_counts", "males_controls_std_dev_transcript_counts", "males_exposed_sum_transcript_counts", "males_exposed_median_transcript_counts", "males_exposed_mean_transcript_counts", "males_exposed_max_transcript_counts", "males_exposed_min_transcript_counts", "males_exposed_std_dev_transcript_counts" ) # Initialize lists of data frames list_transcript_counts_dfs <- list() list_transcript_max_diffs_dfs <- list() ``` # Define functions ```{r define-function(s)} #### Function for determining maximum transcripts per gene##### # Function accepts two data frames (df1 and df2), and a string (comparison) # Returns data frame containing: # gene_name # df1_max_transcript_counts # df2_max_transcript_counts # difference between max transcript counts diff_max_transcripts <- function(df1, df2, comparison) { # Select columns from first data frame df1.max <- df1 %>% select(gene_name, contains("max_transcript_counts")) # Get name of max transcript counts column df1.max.name <- df1.max %>% select(contains("max_transcript_counts")) %>% colnames() # Select columns from second data frame df2.max <- df2 %>% select(gene_name, contains("max_transcript_counts")) # Get name of max transcript counts column df2.max.name <- df2.max %>% select(contains("max_transcript_counts")) %>% colnames() # Join the two max transcripts data frames on gene_name df1max.df2max.joined <- left_join(df1.max, df2.max, by = "gene_name") # Calculate difference between df1.max and df2.max # Filter for only samples with different max transcript counts. # The get() function is required to use the string of the corresponding variable. df1max.df2max.joined %>% select(everything()) %>% mutate(difference = (get(df1.max.name) - get(df2.max.name))) %>% filter(difference != 0) } ######################################################################## ``` # Load all transcript expression data ```{r load-transcript-data} # Expression data whole_tx_table <- read.csv("../data/whole_tx_table.csv") head(whole_tx_table) # Rename gene_names listed as a "." to the associated transcript name whole_tx_table <- whole_tx_table %>% mutate(gene_name = ifelse(gene_name == ".", t_name, gene_name)) head(whole_tx_table) ``` # Count transcripts for each gene for each sample. A transcript is counted if it has an FPKM value > 0. ```{r count-transcripts-per-gene-per-sample} # Create table of transcript counts per gene per sample transcript_counts <- whole_tx_table %>% select(starts_with(c("gene_name", "FPKM"))) %>% group_by(gene_name) %>% summarise((across(everything(), ~sum(. > 0)))) head(transcript_counts) # Rename columns names(transcript_counts) <- gsub(x = names(transcript_counts), pattern = "FPKM", replacement = "transcript_counts") head(transcript_counts) ``` # Calculate max transcripts per gene across all samples ```{r max-transcripts-per-gene} max_transcripts_per_gene <- transcript_counts %>% rowwise() %>% mutate(transcripts.max = max(across(contains(all, ignore.case = FALSE)))) %>% select(ends_with(c("name", "max"))) head(max_transcripts_per_gene) ``` ## Write max transcripts to tab-delimited file ```{r write-max-transcripts-to-file} write.table(max_transcripts_per_gene, file ="../output/34-transcript-counts/transcripts-counts-max_per_gene.tab", quote = FALSE, row.names = FALSE, col.names = FALSE, sep = "\t") ``` # Transcript count calcs ## Read in objects This allows the objects to be quickly loaded instead of waiting to re-run time consuming calcs in chunk below. ```{r load-calculation-objects} # Initialize list of R objects loaded_objects <- list() # Define the directory path directory_path <- "../output/34-transcript-counts/" # Get a list of all .rds files in the directory file_list <- list.files(path = directory_path, pattern = "\\.rds$") # Load all RDS files using readRDS() and sapply loaded_objects <- sapply(file_list, function(file) readRDS(file.path(directory_path, file)), simplify = FALSE) # Now loaded_objects is a list containing all the loaded objects str(loaded_objects) ``` ## Calculations This will take a while to run. For faster access to these datasets, load the objects in the chunk above instead. ```{r perform-transcript-count-calcs} # Perform calculations transcript_counts_per_gene_per_sample <- transcript_counts %>% rowwise() %>% mutate( sum_transcript_counts = sum(c_across(where(is.numeric) & -any_of(count_calcs))), median_transcript_counts = median(c_across(where(is.numeric) & -any_of(count_calcs))), mean_transcript_counts = mean(c_across(where(is.numeric) & -any_of(count_calcs))), max_transcript_counts = max(c_across(where(is.numeric) & -any_of(count_calcs))), min_transcript_counts = min(c_across(where(is.numeric) & -any_of(count_calcs))), std_dev_transcript_counts = sd(c_across(where(is.numeric) & -any_of(count_calcs))) ) # Male stats transcript_counts_per_gene_per_sample_males <- transcript_counts %>% rowwise() %>% select("gene_name", ends_with('M')) %>% mutate( male_sum_transcript_counts = sum(c_across(where(is.numeric) & -any_of(count_calcs))), male_median_transcript_counts = median(c_across(where(is.numeric) & -any_of(count_calcs))), male_mean_transcript_counts = mean(c_across(where(is.numeric) & -any_of(count_calcs))), male_max_transcript_counts = max(c_across(where(is.numeric) & -any_of(count_calcs))), male_min_transcript_counts = min(c_across(where(is.numeric) & -any_of(count_calcs))), male_std_dev_transcript_counts = sd(c_across(where(is.numeric) & -any_of(count_calcs))) ) # Female stats transcript_counts_per_gene_per_sample_females <- transcript_counts %>% rowwise() %>% select("gene_name", ends_with('F')) %>% mutate( female_sum_transcript_counts = sum(c_across(where(is.numeric) & -any_of(count_calcs))), female_median_transcript_counts = median(c_across(where(is.numeric) & -any_of(count_calcs))), female_mean_transcript_counts = mean(c_across(where(is.numeric) & -any_of(count_calcs))), female_max_transcript_counts = max(c_across(where(is.numeric) & -any_of(count_calcs))), female_min_transcript_counts = min(c_across(where(is.numeric) & -any_of(count_calcs))), female_std_dev_transcript_counts = sd(c_across(where(is.numeric) & -any_of(count_calcs))) ) # Controls stats transcript_counts_per_gene_per_sample_controls <- transcript_counts %>% rowwise() %>% select("gene_name", ends_with(controls)) %>% mutate( controls_sum_transcript_counts = sum(c_across(where(is.numeric) & -any_of(count_calcs))), controls_median_transcript_counts = median(c_across(where(is.numeric) & -any_of(count_calcs))), controls_mean_transcript_counts = mean(c_across(where(is.numeric) & -any_of(count_calcs))), controls_max_transcript_counts = max(c_across(where(is.numeric) & -any_of(count_calcs))), controls_min_transcript_counts = min(c_across(where(is.numeric) & -any_of(count_calcs))), controls_std_dev_transcript_counts = sd(c_across(where(is.numeric) & -any_of(count_calcs))) ) # Exposed stats transcript_counts_per_gene_per_sample_exposed <- transcript_counts %>% rowwise() %>% select("gene_name", ends_with(exposed)) %>% mutate( exposed_sum_transcript_counts = sum(c_across(where(is.numeric) & -any_of(count_calcs))), exposed_median_transcript_counts = median(c_across(where(is.numeric) & -any_of(count_calcs))), exposed_mean_transcript_counts = mean(c_across(where(is.numeric) & -any_of(count_calcs))), exposed_max_transcript_counts = max(c_across(where(is.numeric) & -any_of(count_calcs))), exposed_min_transcript_counts = min(c_across(where(is.numeric) & -any_of(count_calcs))), exposed_std_dev_transcript_counts = sd(c_across(where(is.numeric) & -any_of(count_calcs))) ) # Controls females stats transcript_counts_per_gene_per_sample_females_controls <- transcript_counts %>% rowwise() %>% select("gene_name", ends_with(females_controls)) %>% mutate( females_controls_sum_transcript_counts = sum(c_across(where(is.numeric) & -any_of(count_calcs))), females_controls_median_transcript_counts = median(c_across(where(is.numeric) & -any_of(count_calcs))), females_controls_mean_transcript_counts = mean(c_across(where(is.numeric) & -any_of(count_calcs))), females_controls_max_transcript_counts = max(c_across(where(is.numeric) & -any_of(count_calcs))), females_controls_min_transcript_counts = min(c_across(where(is.numeric) & -any_of(count_calcs))), females_controls_std_dev_transcript_counts = sd(c_across(where(is.numeric) & -any_of(count_calcs))) ) # Exposed females stats transcript_counts_per_gene_per_sample_females_exposed <- transcript_counts %>% rowwise() %>% select("gene_name", ends_with(females_exposed)) %>% mutate( females_exposed_sum_transcript_counts = sum(c_across(where(is.numeric) & -any_of(count_calcs))), females_exposed_median_transcript_counts = median(c_across(where(is.numeric) & -any_of(count_calcs))), females_exposed_mean_transcript_counts = mean(c_across(where(is.numeric) & -any_of(count_calcs))), females_exposed_max_transcript_counts = max(c_across(where(is.numeric) & -any_of(count_calcs))), females_exposed_min_transcript_counts = min(c_across(where(is.numeric) & -any_of(count_calcs))), females_exposed_std_dev_transcript_counts = sd(c_across(where(is.numeric) & -any_of(count_calcs))) ) # Controls males stats transcript_counts_per_gene_per_sample_males_controls <- transcript_counts %>% rowwise() %>% select("gene_name", ends_with(males_controls)) %>% mutate( males_controls_sum_transcript_counts = sum(c_across(where(is.numeric) & -any_of(count_calcs))), males_controls_median_transcript_counts = median(c_across(where(is.numeric) & -any_of(count_calcs))), males_controls_mean_transcript_counts = mean(c_across(where(is.numeric) & -any_of(count_calcs))), males_controls_max_transcript_counts = max(c_across(where(is.numeric) & -any_of(count_calcs))), males_controls_min_transcript_counts = min(c_across(where(is.numeric) & -any_of(count_calcs))), males_controls_std_dev_transcript_counts = sd(c_across(where(is.numeric) & -any_of(count_calcs))) ) # Exposed males stats transcript_counts_per_gene_per_sample_males_exposed <- transcript_counts %>% rowwise() %>% select("gene_name", ends_with(males_exposed)) %>% mutate( males_exposed_sum_transcript_counts = sum(c_across(where(is.numeric) & -any_of(count_calcs))), males_exposed_median_transcript_counts = median(c_across(where(is.numeric) & -any_of(count_calcs))), males_exposed_mean_transcript_counts = mean(c_across(where(is.numeric) & -any_of(count_calcs))), males_exposed_max_transcript_counts = max(c_across(where(is.numeric) & -any_of(count_calcs))), males_exposed_min_transcript_counts = min(c_across(where(is.numeric) & -any_of(count_calcs))), males_exposed_std_dev_transcript_counts = sd(c_across(where(is.numeric) & -any_of(count_calcs))) ) # Add data frames to list # Wraps ls() with grep to allow for needed Perl regex (the "^(?!list).*" aspect) because # ls() doesn't support Perl regex # Regex excludes any results beginning with the word "list" list_transcript_counts_dfs <- mget(grep("^(?!list).*", ls(pattern = "transcript_counts_per_gene_per_sample"), value = TRUE, perl = TRUE)) head(transcript_counts) ``` ## Save objects This allows the objects to be quickly loaded instead of waiting to re-run time consuming calcs in above chunk. ```{r} # Write data frames to R objects in ../output/34-transcript-counts/ dir # Uses names of data frames as names of output files. sapply(names(list_transcript_counts_dfs), function(x) saveRDS(list_transcript_counts_dfs[[x]], file = file.path("../output/34-transcript-counts/", paste(x, "rds", sep=".")))) ``` ## Write transcript per gene counts to files. ```{r write-transcript-per-gene-counts-to-files} # Write data frames to CSVs in ../output/34-transcript-counts/ dir # Uses names of data frames as names of output files. sapply(names(list_transcript_counts_dfs), function(x) write.csv(list_transcript_counts_dfs[[x]], file = file.path("../output/34-transcript-counts/", paste(x, "csv", sep=".")), quote = FALSE, row.names = FALSE) ) ``` # Determine differences in max number of transcripts between comparisons ```{r calculate-difference-max-transcripts} for (comparison in comparisons) { if (comparison == "f_vs_m") { diffs.max.transcripts_per_gene.females.vs.males <- diff_max_transcripts(transcript_counts_per_gene_per_sample_females, transcript_counts_per_gene_per_sample_males, comparison) } else if (comparison == "c_vs_e") { diffs.max.transcripts_per_gene.controls.vs.exposed <- diff_max_transcripts(transcript_counts_per_gene_per_sample_controls, transcript_counts_per_gene_per_sample_exposed, comparison) } else if (comparison == "c.f_vs_e.f") { diffs.max.transcripts_per_gene.females_controls.vs.females_exposed <- diff_max_transcripts(transcript_counts_per_gene_per_sample_females_controls, transcript_counts_per_gene_per_sample_females_exposed, comparison) } else if (comparison == "c.m_vs_e.m") { diffs.max.transcripts_per_gene.males_controls.vs.males_exposed <- diff_max_transcripts(transcript_counts_per_gene_per_sample_males_controls, transcript_counts_per_gene_per_sample_males_exposed, comparison) } else if (comparison == "c.f_vs_c.m") { diffs.max.transcripts_per_gene.females_controls.vs.males_controls <- diff_max_transcripts(transcript_counts_per_gene_per_sample_females_controls, transcript_counts_per_gene_per_sample_males_controls, comparison) } else if (comparison == "e.f_vs_e.m") { diffs.max.transcripts_per_gene.females_exposed.vs.males_exposed <- diff_max_transcripts(transcript_counts_per_gene_per_sample_females_exposed, transcript_counts_per_gene_per_sample_males_exposed, comparison) } } # Add data frames to list # Wraps ls() with grep to allow for needed perl regex (the "^(?!list).*" aspect) because # ls() doesn't support perl regex # Regex excludes any results beginning with the word "list" list_transcript_max_diffs_dfs <- mget(grep("^(?!list).*", ls(pattern = "max.transcripts_per_gene"), value = TRUE, perl = TRUE) ) ``` ## Write max transcript diffs to files. ```{r write-transcript-per-gene-counts-to-files} # Write data frames to CSVs in ../output/34-transcript-counts/ dir # Uses names of data frames as names of output files. sapply(names(list_transcript_max_diffs_dfs), function(x) write.csv(list_transcript_max_diffs_dfs[[x]], file = file.path("../output/34-transcript-counts/", paste(x, "csv", sep=".")), quote = FALSE, row.names = FALSE) ) ``` # Create joint file of max transcript counts ```{r} # Initialize merged_df with the first data frame merged_df <- loaded_objects[["transcript_counts_per_gene_per_sample_controls.rds"]] # Loop through the remaining data frames for (df_name in names(loaded_objects)) { # Select columns containing "max_transcript_counts" from the current data frame max_columns <- grep("max_transcript_counts", names(loaded_objects[[df_name]]), value = TRUE) # Merge the current data frame into merged_df merged_df <- merge(merged_df, loaded_objects[[df_name]], by = "gene_name", all = TRUE, suffixes = c("", "")) # Add the columns containing "max_transcript_counts" from the current data frame to merged_df merged_df[max_columns] <- loaded_objects[[df_name]][max_columns] } # Remove columns appended with ".1" merged_df <- merged_df[, !grepl("\\.1$", names(merged_df))] # Keep only the gene_name column and columns containing "max_transcript_counts" merged_df <- merged_df[grep("gene_name|max", names(merged_df))] str(merged_df) # Select multiple columns by column name selected_cols <- c("gene_name", "females_controls_max_transcript_counts", "males_controls_max_transcript_counts", "females_exposed_max_transcript_counts", "males_exposed_max_transcript_counts", "max_transcript_counts" ) # Select the specified columns from merged_df selected_df <- merged_df %>% dplyr::select(all_of(selected_cols)) str(selected_df) ``` ## Write female control/exposed & male control/exposed max transcripts ```{r} # Write to CSV write.csv(selected_df, file = "../output/34-transcript-counts/max.transcripts.females-c-e.males-c-e.csv", quote = FALSE, row.names = FALSE ) ```