Correlation between the epigenetic protein counts from the gene count matrix and the PC1 (calculated in 01-Physiology.Rmd).

Read in gene count matrix.

count_matrix <- read.csv("../data/RNAseq/gene_count_matrix_filtered.csv") %>%
  rename("gene_id" = "X")
head(count_matrix)
##     gene_id  C17  C18  C19  C20  C21  C22  C23  C24  C25  C26  C27  C28  C29
## 1 Pver_g289 1290  935  942 1027  594 1595 1066 1506 1760 1148 1290 1561  959
## 2 Pver_g288 1320 1326  828 1189  901 1571 1098 1595 1819 1286 1373 1655 1212
## 3 Pver_g285 1691 1597 1251 1661 1023 1728 1550 1650 2311 1771 1908 2304  978
## 4 Pver_g284 1876 2487 1787 1867 1356 1969 2288 1348 2206 1583 2044 2125 1465
## 5 Pver_g287 4242 3788 3151 4740 3100 3546 2998 3445 6343 4503 5864 3978 2290
## 6 Pver_g286 2980 3313 3445 3043  961 1945 1776 2910 3799 3642 3246 3426 2459
##    C30  C31  C32  E10  E11  E12  E13  E14  E15  E16   E1   E2   E3   E4   E5
## 1 1609  858 1792 2689  708 1941 1544 1048 1326 1172 1070 1322 1223  895  686
## 2 2026 1088 1870 2059  928 1853 1163 1255 1415 1566  882 1161  970  904  574
## 3 1643 1411 1710 2174 1101 2085 2124 1319 2084 2051 1048 1716  737  855  591
## 4 1731 1511 2322 2728  954 2096 2787 1439 1294 2345 1358 1778 1194 1136  903
## 5 4638 3667 4560 5424 2867 5546 4591 4048 4928 4833 3425 4450 4080 3301 1915
## 6 1904 2990 3590 3342 2237 2827 3608 2473 2561 3265 4407 2104 3686 2165 3634
##     E6   E7   E8   E9
## 1  692  930 1387 1044
## 2  634 1108 1040 1575
## 3  607 1236 1215 1800
## 4  905 1412 1413 2095
## 5 2496 3217 3603 3565
## 6 3402 2958 2399 1669
# Metadata for RNAseq
rna_meta <- read.csv("../data/RNAseq/metadata.RNAseq.csv")

The RNA and physiology samples do not have the same name, so the RNAseq metadata will be needed downstream.

Read in blast results for Pverr against epigenetic proteins of interest

blast <- read.delim("../output/02-Pver-blast/Mach-blastp-Pver_out.tab", sep = "", header = F)
colnames(blast) <- c("query_id", "subject_id", "percent_identity", "alignment_length", 
                     "mismatches", "gap_openings", "q_start", "q_end", "s_start", 
                     "s_end", "e_value", "bit_score")
head(blast)
##     query_id     subject_id percent_identity alignment_length mismatches
## 1  Dnmt1-201   Pver_g950.t1           55.055             1464        582
## 2 Dnmt3a-201  Pver_g3152.t1           46.609              693        317
## 3 Dnmt3b-201  Pver_g3152.t1           43.733              718        324
## 4 Dnmt3l-201 Pver_g14219.t1           60.346             2542        933
## 5  USP9Y-201 Pver_g14219.t1           59.713             2512        943
## 6  USP10-201 Pver_g13975.t2           44.669              347        150
##   gap_openings q_start q_end s_start s_end  e_value bit_score
## 1           23     185  1598     139  1576 0.00e+00      1550
## 2            9     246   907      33   703 0.00e+00       622
## 3           12     181   860      29   704 0.00e+00       597
## 4           26     453  2971       1  2490 0.00e+00      3108
## 5           25      30  2520      27  2490 0.00e+00      3035
## 6           13     338   664     822  1146 4.29e-71       254

The gene names in the count matrix and the blast dfs are not the same; this is due to naming errors made by the authors of the Pverr genome paper. They made a supplementary file that has both naming iterations, so this will be used to make sure the gene names are the same in each df

Read in file with gene name iterations

names <- read_excel("../data/RNAseq/FileS2_Pver_gene_annot_May28.xlsx", skip = 4) %>%
  select(Query, Gene)
names$Gene <- gsub("_gene", "", names$Gene)

Join names df with the blast df

blast_names <- blast %>%
  full_join(names, by = c("subject_id" = "Query"))

Join blast_names df with count matrix

count_blast <- blast_names %>%
  full_join(count_matrix, by = c("Gene" = "gene_id"))
str(count_blast)
## 'data.frame':    27587 obs. of  45 variables:
##  $ query_id        : chr  "Dnmt1-201" "Dnmt3a-201" "Dnmt3b-201" "Dnmt3l-201" ...
##  $ subject_id      : chr  "Pver_g950.t1" "Pver_g3152.t1" "Pver_g3152.t1" "Pver_g14219.t1" ...
##  $ percent_identity: num  55.1 46.6 43.7 60.3 59.7 ...
##  $ alignment_length: int  1464 693 718 2542 2512 347 492 369 492 521 ...
##  $ mismatches      : int  582 317 324 933 943 150 223 78 168 167 ...
##  $ gap_openings    : int  23 9 12 26 25 13 8 3 4 5 ...
##  $ q_start         : int  185 246 181 453 30 338 62 16 1 6 ...
##  $ q_end           : int  1598 907 860 2971 2520 664 539 369 487 526 ...
##  $ s_start         : int  139 33 29 1 27 822 2 1 1 14 ...
##  $ s_end           : int  1576 703 704 2490 2490 1146 451 367 482 458 ...
##  $ e_value         : num  0 0 0 0 0 ...
##  $ bit_score       : num  1550 622 597 3108 3035 ...
##  $ Gene            : chr  "Pver_g950" "Pver_g3152" "Pver_g3152" "Pver_g14219" ...
##  $ C17             : int  854 2510 2510 9550 9550 5940 2191 3258 2913 2191 ...
##  $ C18             : int  544 1794 1794 6991 6991 4884 1691 2955 2349 1691 ...
##  $ C19             : int  761 2166 2166 7975 7975 5079 1636 2145 1940 1636 ...
##  $ C20             : int  588 1423 1423 6445 6445 6488 1557 2162 2403 1557 ...
##  $ C21             : int  340 1193 1193 4572 4572 3733 971 1800 1466 971 ...
##  $ C22             : int  754 2376 2376 7859 7859 5978 1730 4275 2609 1730 ...
##  $ C23             : int  506 1877 1877 5708 5708 7302 1394 2048 2020 1394 ...
##  $ C24             : int  731 1903 1903 8630 8630 6384 2012 3597 2748 2012 ...
##  $ C25             : int  909 3152 3152 11414 11414 7948 2743 4296 3370 2743 ...
##  $ C26             : int  716 2331 2331 8128 8128 5083 1679 3253 2293 1679 ...
##  $ C27             : int  1126 2469 2469 9519 9519 9738 1982 3108 2343 1982 ...
##  $ C28             : int  931 2716 2716 9664 9664 7557 2131 4222 2803 2131 ...
##  $ C29             : int  600 1815 1815 6306 6306 3456 1304 3042 1930 1304 ...
##  $ C30             : int  674 2626 2626 8019 8019 5410 2502 3677 3366 2502 ...
##  $ C31             : int  598 2033 2033 7975 7975 5189 2075 3040 2140 2075 ...
##  $ C32             : int  907 2262 2262 8385 8385 6494 1724 3354 2594 1724 ...
##  $ E10             : int  721 3454 3454 9911 9911 7398 3105 4690 3631 3105 ...
##  $ E11             : int  553 1848 1848 5997 5997 8968 1139 1496 1752 1139 ...
##  $ E12             : int  705 3964 3964 10631 10631 5893 2446 4202 3960 2446 ...
##  $ E13             : int  759 2792 2792 9973 9973 5355 2041 3045 2775 2041 ...
##  $ E14             : int  703 1899 1899 6410 6410 7177 1671 2890 2076 1671 ...
##  $ E15             : int  646 1959 1959 7656 7656 5367 1462 2986 2103 1462 ...
##  $ E16             : int  756 2972 2972 11094 11094 8040 2216 3723 2714 2216 ...
##  $ E1              : int  609 3122 3122 8392 8392 5519 1879 2988 2198 1879 ...
##  $ E2              : int  829 2494 2494 9779 9779 6694 1943 3118 2134 1943 ...
##  $ E3              : int  621 2233 2233 7925 7925 6261 1574 2506 1934 1574 ...
##  $ E4              : int  459 1514 1514 5967 5967 4790 1314 2064 1567 1314 ...
##  $ E5              : int  549 1651 1651 6198 6198 3124 913 1508 1279 913 ...
##  $ E6              : int  566 2697 2697 7577 7577 4939 1592 2524 1763 1592 ...
##  $ E7              : int  504 2138 2138 8172 8172 5338 1808 3066 2383 1808 ...
##  $ E8              : int  592 1905 1905 6741 6741 3940 1255 2872 2013 1255 ...
##  $ E9              : int  780 2495 2495 9094 9094 5893 1745 3003 2977 1745 ...

Manipulate data so that there is a sample column

reshaped_data <- count_blast %>%
  # Pivot longer to stack values
  pivot_longer(cols = C17:E9, 
               names_to = "sample", 
               values_to = "value")

Join with RNA metadata

reshaped_data <- reshaped_data %>%
  full_join(rna_meta, by = c("sample" = "sample_id"))

Read in phys PC data

phys_pc <- read.csv("../data/01-Physiology/phys_data_pc.csv")

Join phys data with reshaped data df

count_phys <- phys_pc %>%
  full_join(reshaped_data, by = "fragment.ID") %>%
  na.omit()

count_phys$value <- as.numeric(count_phys$value)

Run correlation

correlation_results <- count_phys %>%
  group_by(query_id) %>%
  summarize(
    correlation = list(cor.test(PC1, value))) %>%
  mutate(
    estimate = map_dbl(correlation, ~ .$estimate),
    p.value = map_dbl(correlation, ~ .$p.value)
  ) %>%
  select(query_id, estimate, p.value) %>%
  filter(p.value <= 0.05) %>%
  arrange(desc(estimate)) %>%
  mutate(query_id = factor(query_id, levels = query_id[order(estimate)]))


head(correlation_results)
## # A tibble: 6 × 3
##   query_id    estimate p.value
##   <fct>          <dbl>   <dbl>
## 1 Setmar-201     0.479 0.00557
## 2 Mbd4-201       0.425 0.0153 
## 3 PUS10-201      0.384 0.0299 
## 4 Prmt2-203      0.375 0.0342 
## 5 RNF8-202       0.354 0.0469 
## 6 PPP1R3G-201   -0.370 0.0372
write.csv(correlation_results, "../output/03-EpigeneticProteins-Phys-Correlation/epi_protein_correlation.csv")

Plot correlation heatmap with the x axis being epigenetic proteins. Only plotting the correlations that have pvalue <= 0.05.

ggplot(correlation_results, aes(x = query_id, y = 1, fill = estimate)) +
  geom_tile(color = "white") +  # Create the tiles
  scale_fill_gradient2(low = "blue", mid = "white", high = "red", midpoint = 0) +  # Color gradient
  geom_text(aes(label = round(estimate, 2)), color = "black") +  # Add text labels
  labs(title = "p≤0.05",
       x = "Epigenetic Protein",
       y = "") +  # y-axis label empty for a single row
  theme_minimal() +  # Use a minimal theme
  coord_flip() +  # Flip the coordinates so that the Query IDs are on the y-axis
  theme(axis.text.y = element_text(size = 8, color = "black"),  # Display and customize y-axis labels (Query IDs)
        axis.ticks.y = element_blank(),  # Hide y-axis ticks
        axis.text.x = element_blank(),
        axis.title = element_text(size = 10),
        panel.grid.major = element_blank(),  # Remove major grid lines
        panel.grid.minor = element_blank(),  # Remove minor grid lines
        legend.title = element_text(size = 10),  # Customize legend title size
        legend.text = element_text(size = 8))  # Customize legend text size