Correlation between the epigenetic protein counts from the gene count matrix and the PC1 (calculated in 01-Physiology.Rmd).
Read in gene count matrix.
count_matrix <- read.csv("../data/RNAseq/gene_count_matrix_filtered.csv") %>%
rename("gene_id" = "X")
head(count_matrix)
## gene_id C17 C18 C19 C20 C21 C22 C23 C24 C25 C26 C27 C28 C29
## 1 Pver_g289 1290 935 942 1027 594 1595 1066 1506 1760 1148 1290 1561 959
## 2 Pver_g288 1320 1326 828 1189 901 1571 1098 1595 1819 1286 1373 1655 1212
## 3 Pver_g285 1691 1597 1251 1661 1023 1728 1550 1650 2311 1771 1908 2304 978
## 4 Pver_g284 1876 2487 1787 1867 1356 1969 2288 1348 2206 1583 2044 2125 1465
## 5 Pver_g287 4242 3788 3151 4740 3100 3546 2998 3445 6343 4503 5864 3978 2290
## 6 Pver_g286 2980 3313 3445 3043 961 1945 1776 2910 3799 3642 3246 3426 2459
## C30 C31 C32 E10 E11 E12 E13 E14 E15 E16 E1 E2 E3 E4 E5
## 1 1609 858 1792 2689 708 1941 1544 1048 1326 1172 1070 1322 1223 895 686
## 2 2026 1088 1870 2059 928 1853 1163 1255 1415 1566 882 1161 970 904 574
## 3 1643 1411 1710 2174 1101 2085 2124 1319 2084 2051 1048 1716 737 855 591
## 4 1731 1511 2322 2728 954 2096 2787 1439 1294 2345 1358 1778 1194 1136 903
## 5 4638 3667 4560 5424 2867 5546 4591 4048 4928 4833 3425 4450 4080 3301 1915
## 6 1904 2990 3590 3342 2237 2827 3608 2473 2561 3265 4407 2104 3686 2165 3634
## E6 E7 E8 E9
## 1 692 930 1387 1044
## 2 634 1108 1040 1575
## 3 607 1236 1215 1800
## 4 905 1412 1413 2095
## 5 2496 3217 3603 3565
## 6 3402 2958 2399 1669
# Metadata for RNAseq
rna_meta <- read.csv("../data/RNAseq/metadata.RNAseq.csv")
The RNA and physiology samples do not have the same name, so the RNAseq metadata will be needed downstream.
Read in blast results for Pverr against epigenetic proteins of interest
blast <- read.delim("../output/02-Pver-blast/Mach-blastp-Pver_out.tab", sep = "", header = F)
colnames(blast) <- c("query_id", "subject_id", "percent_identity", "alignment_length",
"mismatches", "gap_openings", "q_start", "q_end", "s_start",
"s_end", "e_value", "bit_score")
head(blast)
## query_id subject_id percent_identity alignment_length mismatches
## 1 Dnmt1-201 Pver_g950.t1 55.055 1464 582
## 2 Dnmt3a-201 Pver_g3152.t1 46.609 693 317
## 3 Dnmt3b-201 Pver_g3152.t1 43.733 718 324
## 4 Dnmt3l-201 Pver_g14219.t1 60.346 2542 933
## 5 USP9Y-201 Pver_g14219.t1 59.713 2512 943
## 6 USP10-201 Pver_g13975.t2 44.669 347 150
## gap_openings q_start q_end s_start s_end e_value bit_score
## 1 23 185 1598 139 1576 0.00e+00 1550
## 2 9 246 907 33 703 0.00e+00 622
## 3 12 181 860 29 704 0.00e+00 597
## 4 26 453 2971 1 2490 0.00e+00 3108
## 5 25 30 2520 27 2490 0.00e+00 3035
## 6 13 338 664 822 1146 4.29e-71 254
The gene names in the count matrix and the blast dfs are not the same; this is due to naming errors made by the authors of the Pverr genome paper. They made a supplementary file that has both naming iterations, so this will be used to make sure the gene names are the same in each df
Read in file with gene name iterations
names <- read_excel("../data/RNAseq/FileS2_Pver_gene_annot_May28.xlsx", skip = 4) %>%
select(Query, Gene)
names$Gene <- gsub("_gene", "", names$Gene)
Join names df with the blast df
blast_names <- blast %>%
full_join(names, by = c("subject_id" = "Query"))
Join blast_names df with count matrix
count_blast <- blast_names %>%
full_join(count_matrix, by = c("Gene" = "gene_id"))
str(count_blast)
## 'data.frame': 27587 obs. of 45 variables:
## $ query_id : chr "Dnmt1-201" "Dnmt3a-201" "Dnmt3b-201" "Dnmt3l-201" ...
## $ subject_id : chr "Pver_g950.t1" "Pver_g3152.t1" "Pver_g3152.t1" "Pver_g14219.t1" ...
## $ percent_identity: num 55.1 46.6 43.7 60.3 59.7 ...
## $ alignment_length: int 1464 693 718 2542 2512 347 492 369 492 521 ...
## $ mismatches : int 582 317 324 933 943 150 223 78 168 167 ...
## $ gap_openings : int 23 9 12 26 25 13 8 3 4 5 ...
## $ q_start : int 185 246 181 453 30 338 62 16 1 6 ...
## $ q_end : int 1598 907 860 2971 2520 664 539 369 487 526 ...
## $ s_start : int 139 33 29 1 27 822 2 1 1 14 ...
## $ s_end : int 1576 703 704 2490 2490 1146 451 367 482 458 ...
## $ e_value : num 0 0 0 0 0 ...
## $ bit_score : num 1550 622 597 3108 3035 ...
## $ Gene : chr "Pver_g950" "Pver_g3152" "Pver_g3152" "Pver_g14219" ...
## $ C17 : int 854 2510 2510 9550 9550 5940 2191 3258 2913 2191 ...
## $ C18 : int 544 1794 1794 6991 6991 4884 1691 2955 2349 1691 ...
## $ C19 : int 761 2166 2166 7975 7975 5079 1636 2145 1940 1636 ...
## $ C20 : int 588 1423 1423 6445 6445 6488 1557 2162 2403 1557 ...
## $ C21 : int 340 1193 1193 4572 4572 3733 971 1800 1466 971 ...
## $ C22 : int 754 2376 2376 7859 7859 5978 1730 4275 2609 1730 ...
## $ C23 : int 506 1877 1877 5708 5708 7302 1394 2048 2020 1394 ...
## $ C24 : int 731 1903 1903 8630 8630 6384 2012 3597 2748 2012 ...
## $ C25 : int 909 3152 3152 11414 11414 7948 2743 4296 3370 2743 ...
## $ C26 : int 716 2331 2331 8128 8128 5083 1679 3253 2293 1679 ...
## $ C27 : int 1126 2469 2469 9519 9519 9738 1982 3108 2343 1982 ...
## $ C28 : int 931 2716 2716 9664 9664 7557 2131 4222 2803 2131 ...
## $ C29 : int 600 1815 1815 6306 6306 3456 1304 3042 1930 1304 ...
## $ C30 : int 674 2626 2626 8019 8019 5410 2502 3677 3366 2502 ...
## $ C31 : int 598 2033 2033 7975 7975 5189 2075 3040 2140 2075 ...
## $ C32 : int 907 2262 2262 8385 8385 6494 1724 3354 2594 1724 ...
## $ E10 : int 721 3454 3454 9911 9911 7398 3105 4690 3631 3105 ...
## $ E11 : int 553 1848 1848 5997 5997 8968 1139 1496 1752 1139 ...
## $ E12 : int 705 3964 3964 10631 10631 5893 2446 4202 3960 2446 ...
## $ E13 : int 759 2792 2792 9973 9973 5355 2041 3045 2775 2041 ...
## $ E14 : int 703 1899 1899 6410 6410 7177 1671 2890 2076 1671 ...
## $ E15 : int 646 1959 1959 7656 7656 5367 1462 2986 2103 1462 ...
## $ E16 : int 756 2972 2972 11094 11094 8040 2216 3723 2714 2216 ...
## $ E1 : int 609 3122 3122 8392 8392 5519 1879 2988 2198 1879 ...
## $ E2 : int 829 2494 2494 9779 9779 6694 1943 3118 2134 1943 ...
## $ E3 : int 621 2233 2233 7925 7925 6261 1574 2506 1934 1574 ...
## $ E4 : int 459 1514 1514 5967 5967 4790 1314 2064 1567 1314 ...
## $ E5 : int 549 1651 1651 6198 6198 3124 913 1508 1279 913 ...
## $ E6 : int 566 2697 2697 7577 7577 4939 1592 2524 1763 1592 ...
## $ E7 : int 504 2138 2138 8172 8172 5338 1808 3066 2383 1808 ...
## $ E8 : int 592 1905 1905 6741 6741 3940 1255 2872 2013 1255 ...
## $ E9 : int 780 2495 2495 9094 9094 5893 1745 3003 2977 1745 ...
Manipulate data so that there is a sample column
reshaped_data <- count_blast %>%
# Pivot longer to stack values
pivot_longer(cols = C17:E9,
names_to = "sample",
values_to = "value")
Join with RNA metadata
reshaped_data <- reshaped_data %>%
full_join(rna_meta, by = c("sample" = "sample_id"))
Read in phys PC data
phys_pc <- read.csv("../data/01-Physiology/phys_data_pc.csv")
Join phys data with reshaped data df
count_phys <- phys_pc %>%
full_join(reshaped_data, by = "fragment.ID") %>%
na.omit()
count_phys$value <- as.numeric(count_phys$value)
Run correlation
correlation_results <- count_phys %>%
group_by(query_id) %>%
summarize(
correlation = list(cor.test(PC1, value))) %>%
mutate(
estimate = map_dbl(correlation, ~ .$estimate),
p.value = map_dbl(correlation, ~ .$p.value)
) %>%
select(query_id, estimate, p.value) %>%
filter(p.value <= 0.05) %>%
arrange(desc(estimate)) %>%
mutate(query_id = factor(query_id, levels = query_id[order(estimate)]))
head(correlation_results)
## # A tibble: 6 × 3
## query_id estimate p.value
## <fct> <dbl> <dbl>
## 1 Setmar-201 0.479 0.00557
## 2 Mbd4-201 0.425 0.0153
## 3 PUS10-201 0.384 0.0299
## 4 Prmt2-203 0.375 0.0342
## 5 RNF8-202 0.354 0.0469
## 6 PPP1R3G-201 -0.370 0.0372
write.csv(correlation_results, "../output/03-EpigeneticProteins-Phys-Correlation/epi_protein_correlation.csv")
Plot correlation heatmap with the x axis being epigenetic proteins. Only plotting the correlations that have pvalue <= 0.05.
ggplot(correlation_results, aes(x = query_id, y = 1, fill = estimate)) +
geom_tile(color = "white") + # Create the tiles
scale_fill_gradient2(low = "blue", mid = "white", high = "red", midpoint = 0) + # Color gradient
geom_text(aes(label = round(estimate, 2)), color = "black") + # Add text labels
labs(title = "p≤0.05",
x = "Epigenetic Protein",
y = "") + # y-axis label empty for a single row
theme_minimal() + # Use a minimal theme
coord_flip() + # Flip the coordinates so that the Query IDs are on the y-axis
theme(axis.text.y = element_text(size = 8, color = "black"), # Display and customize y-axis labels (Query IDs)
axis.ticks.y = element_blank(), # Hide y-axis ticks
axis.text.x = element_blank(),
axis.title = element_text(size = 10),
panel.grid.major = element_blank(), # Remove major grid lines
panel.grid.minor = element_blank(), # Remove minor grid lines
legend.title = element_text(size = 10), # Customize legend title size
legend.text = element_text(size = 8)) # Customize legend text size