--- title: "Correlation between epigenetic proteins and physiology" author: "Jill Ashey" date: "2024-10-02" output: html_document --- Correlation between the epigenetic protein counts from the gene count matrix and the PC1 (calculated in 01-Physiology.Rmd). ```{r setup, include=FALSE} knitr::opts_chunk$set(echo = TRUE) library(tidyverse) library(readxl) library(reshape2) ``` ### Pocillopora This data is from [Becker et al. 2021](https://link.springer.com/article/10.1007/s00338-021-02138-2), which examined the effect of chronic low level nutrient enrichment on Pocillopora spp. The github for that project is [here](https://github.com/daniellembecker/Chronic_low_nutrient_enrichment_benefits_coral_thermal_performance_fore_reef_habitat). Read in gene count matrix. ```{r} count_matrix <- read.csv("../data/RNAseq/gene_count_matrix_filtered_poc.csv") %>% rename("gene_id" = "X") head(count_matrix) # Metadata for RNAseq rna_meta <- read.csv("../data/RNAseq/metadata.RNAseq_poc.csv") ``` The RNA and physiology samples do not have the same name, so the RNAseq metadata will be needed downstream. Read in blast results for Pverr against epigenetic proteins of interest ```{r} blast <- read.delim("../output/02-Pver-blast/Mach-blastp-Pver_out.tab", sep = "", header = F) colnames(blast) <- c("query_id", "subject_id", "percent_identity", "alignment_length", "mismatches", "gap_openings", "q_start", "q_end", "s_start", "s_end", "e_value", "bit_score") head(blast) ``` The gene names in the count matrix and the blast dfs are not the same; this is due to naming errors made by the authors of the [Pverr genome paper](https://academic.oup.com/gbe/article/12/10/1911/5898631?login=false#supplementary-data). They made a supplementary file that has both naming iterations, so this will be used to make sure the gene names are the same in each df Read in file with gene name iterations ```{r} names <- read_excel("../data/RNAseq/FileS2_Pver_gene_annot_May28.xlsx", skip = 4) %>% select(Query, Gene) names$Gene <- gsub("_gene", "", names$Gene) ``` Join names df with the blast df ```{r} blast_names <- blast %>% full_join(names, by = c("subject_id" = "Query")) ``` Join blast_names df with count matrix ```{r} count_blast <- blast_names %>% full_join(count_matrix, by = c("Gene" = "gene_id")) str(count_blast) ``` Manipulate data so that there is a sample column ```{r} reshaped_data <- count_blast %>% # Pivot longer to stack values pivot_longer(cols = C17:E9, names_to = "sample", values_to = "value") ``` Join with RNA metadata ```{r} reshaped_data <- reshaped_data %>% full_join(rna_meta, by = c("sample" = "sample_id")) ``` Read in phys PC data ```{r} phys_pc <- read.csv("../data/01-Physiology/phys_data_pc_poc.csv") ``` Join phys data with reshaped data df ```{r} count_phys <- phys_pc %>% full_join(reshaped_data, by = "fragment.ID") %>% na.omit() count_phys$value <- as.numeric(count_phys$value) ``` ID outliers ```{r} identify_outliers <- function(x) { Q1 <- quantile(x, 0.25) Q3 <- quantile(x, 0.75) IQR <- Q3 - Q1 lower_bound <- Q1 - 1.5 * IQR upper_bound <- Q3 + 1.5 * IQR return(x < lower_bound | x > upper_bound) } # Flag and sum outliers by group count_phys_outliers <- count_phys %>% group_by(query_id) %>% mutate(is_outlier = identify_outliers(value)) #%>% #summarise(total_outliers = sum(is_outlier)) # Remove outliers count_phys_no_outliers <- count_phys_outliers %>% filter(is_outlier == FALSE) ``` Run correlation ```{r} correlation_results_no_outliers <- count_phys_no_outliers %>% group_by(query_id) %>% summarize( correlation = list(cor.test(PC1, value))) %>% mutate( estimate = map_dbl(correlation, ~ .$estimate), p.value = map_dbl(correlation, ~ .$p.value) ) %>% select(query_id, estimate, p.value) %>% filter(p.value <= 0.05) %>% arrange(desc(estimate)) %>% #filter(estimate > 0) %>% mutate(query_id = factor(query_id, levels = query_id[order(estimate)])) %>% filter(!query_id =="USP53-201") # remove duplicate protein write.csv(correlation_results_no_outliers, "../output/03-EpigeneticProteins-Phys-Correlation/epi_protein_correlation_poc.csv") ``` Subset df by significant correlation and plot ```{r} sig_prot <- correlation_results_no_outliers$query_id # Subset by significant correlations subset_count_phys_no_outliers <- count_phys_no_outliers %>% filter(query_id %in% sig_prot) %>% full_join(correlation_results_no_outliers, by = "query_id") ggplot(subset_count_phys_no_outliers, aes(x = PC1, y = value, color = query_id)) + geom_point(size = 3) + geom_smooth(method = "lm", se = FALSE, linetype = "solid", color = "black") + geom_text(data = subset_count_phys_no_outliers[!duplicated(subset_count_phys_no_outliers$query_id), ], aes(label = paste("Estimate:", round(estimate, 2))), x = Inf, y = Inf, hjust = 1.1, vjust = 1.5, size = 3, inherit.aes = FALSE) + #scale_color_manual(values = c("Significant" = "red", "Not Significant" = "black")) + facet_wrap(~query_id, scales = "free") + theme(legend.position = "none") ``` ### Acropora pulchra Data is from Danielle's 2022 experiment, which examined the physiological and molecular effects of a simulated marine heatwave on Acropora pulchra. The github for that project is [here](https://github.com/daniellembecker/A.pul_Heatwave/tree/master). Read in gene count matrix. ```{r} # Count matrix from genome count_matrix <- read.csv("../data/RNAseq/Apul_gene_count_matrix.csv") # Rename columns colnames(count_matrix)[-1] <- gsub("^X", "ACR", colnames(count_matrix)[-1]) # Replace 'X' with 'ACR' colnames(count_matrix)[-1] <- gsub("_sorted\\.bam\\.gtf", "", colnames(count_matrix)[-1]) # Remove '_sorted.bam.gtf' # RNA meta ``` Read in blast results for Pverr against epigenetic proteins of interest ```{r} blast_apul <- read.delim("../output/08-Apul-epimods-blast/Mach-tblastn-Apul_out.tab", sep = "", header = F) colnames(blast_apul) <- c("query_id", "subject_id", "percent_identity", "alignment_length", "mismatches", "gap_openings", "q_start", "q_end", "s_start", "s_end", "e_value", "bit_score") head(blast_apul) # Create a new column to extract the desired part of subject_id - needed to merge with gff blast_apul <- blast_apul %>% mutate( subject_id_extracted = str_extract(subject_id, "mRNA::ptg\\d+l:\\d{4}") # Extract the required format ) ``` The subject IDs for blast are from the gff and are just using the mRNA, scaffold, and location of feature. GFF needs to be parsed to match the blast names with the gff. Read in gff ```{r} gff_apul <- read.csv("../data/RNAseq/Acropora_pulchra.gff3", sep = "", header = F, skip = 1) %>% select(!V10) colnames(gff_apul) <- c("scaffold", "source", "type", "start", "end", "score", "strand", "phase", "attr") ``` Make a column in the gff that matches the subject_id column in the blast df ```{r} # Select only mRNA gff_apul_mRNA <- gff_apul %>% filter(type == "mRNA") # Create column to match subject_id in blast df; pull out gene name information gff_apul_mRNA <- gff_apul_mRNA %>% mutate(subject_id_extracted = paste0("mRNA::", scaffold, ":", substr(start, 1, 4))) %>% # Use only the first four digits of start mutate(Parent = str_extract(attr, "(?<=Parent=)[^;]+")) # Extract Parent value ``` Join gff and blast df ```{r} blast_apul_names <- blast_apul %>% left_join(gff_apul_mRNA, by = "subject_id_extracted") %>% na.omit() # Extract the end value from subject_id (after the "-") blast_apul_names <- blast_apul_names %>% mutate(subject_id_end = as.numeric(str_extract(subject_id, "(?<=-)[0-9]+"))) # Extract the number after "-" # Keep rows where subject_id_end matches the end column blast_apul_names <- blast_apul_names %>% filter(subject_id_end == end) ``` Merge blast info with count matrix ```{r} count_blast <- blast_apul_names %>% full_join(count_matrix, by = c("Parent" = "gene_id")) %>% na.omit() ``` Manipulate data so that there is a sample column ```{r} reshaped_data_apul <- count_blast %>% # Pivot longer to stack values pivot_longer(cols = ACR12:ACR8, names_to = "sample", values_to = "value") ``` Read in phys PC data ```{r} phys_pc_apul <- read.csv("../data/01-Physiology/phys_data_pc_apul.csv") %>% mutate(fragment_ID = gsub("_5", "", fragment_ID)) ``` Join phys data with reshaped data df ```{r} count_phys_apul <- phys_pc_apul %>% full_join(reshaped_data_apul, by = c("fragment_ID" = "sample")) %>% na.omit() count_phys_apul$value <- as.numeric(count_phys_apul$value) ``` ID outliers ```{r} identify_outliers <- function(x) { Q1 <- quantile(x, 0.25) Q3 <- quantile(x, 0.75) IQR <- Q3 - Q1 lower_bound <- Q1 - 1.5 * IQR upper_bound <- Q3 + 1.5 * IQR return(x < lower_bound | x > upper_bound) } # Flag and sum outliers by group count_phys_outliers_apul <- count_phys_apul %>% group_by(query_id) %>% mutate(is_outlier = identify_outliers(value)) #%>% #summarise(total_outliers = sum(is_outlier)) # Remove outliers count_phys_no_outliers_apul <- count_phys_outliers_apul %>% filter(is_outlier == FALSE) ``` Run correlation ```{r} correlation_results_no_outliers_apul <- count_phys_no_outliers_apul %>% group_by(query_id) %>% summarize( correlation = list(cor.test(PC1, value))) %>% mutate( estimate = map_dbl(correlation, ~ .$estimate), p.value = map_dbl(correlation, ~ .$p.value) ) %>% select(query_id, estimate, p.value) %>% filter(p.value <= 0.05) %>% arrange(desc(estimate)) %>% #filter(estimate > 0) %>% mutate(query_id = factor(query_id, levels = query_id[order(estimate)])) write.csv(correlation_results_no_outliers_apul, "../output/03-EpigeneticProteins-Phys-Correlation/epi_protein_correlation_apul.csv") ``` Subset df by significant correlation and plot ```{r} sig_prot_apul <- correlation_results_no_outliers_apul$query_id # Subset by significant correlations subset_count_phys_no_outliers_apul <- count_phys_no_outliers_apul %>% filter(query_id %in% sig_prot_apul) %>% full_join(correlation_results_no_outliers_apul, by = "query_id") ggplot(subset_count_phys_no_outliers_apul, aes(x = PC1, y = value, color = query_id)) + geom_point(size = 3) + geom_smooth(method = "lm", se = FALSE, linetype = "solid", color = "black") + geom_text(data = subset_count_phys_no_outliers_apul[!duplicated(subset_count_phys_no_outliers_apul$query_id), ], aes(label = paste("Estimate:", round(estimate, 2))), x = Inf, y = Inf, hjust = 1.1, vjust = 1.5, size = 3, inherit.aes = FALSE) + #scale_color_manual(values = c("Significant" = "red", "Not Significant" = "black")) + facet_wrap(~query_id, scales = "free") + theme(legend.position = "none") ```