Ran MACAU in a Jupyter Notebook. See 05-MACAU.ipynb
In this notebook I assess patterns in methylated loci that MACAU identified as influence by oyster phenotype, aka oyster shell length.
list.of.packages <- c("vegan", "dplyr", "factoextra", "FactoMineR", "tidyverse", "tibble", "reshape2", "gplots", "qvalue", "cluster", "here", "gridExtra") #add new libraries here
new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
if(length(new.packages)) install.packages(new.packages)
# Load all libraries
lapply(list.of.packages, FUN = function(X) {
do.call("require", list(X))
})
macau.20190812 <- read.table(file=here::here("analyses","macau","output","20190812-macau.assoc.txt"), sep = "\t", stringsAsFactors = F)
colnames(macau.20190812) <- macau.20190812[1,]
macau.20190812 <- macau.20190812[-1, ]
macau.20190812$pvalue <- as.numeric(macau.20190812$pvalue)
hist(macau.20190812$pvalue)
# Read dataframes in if needed (object created in noteboook 04)
meth.destranded.df <- read_delim(here::here("analyses", "meth.destranded.df.txt"), delim = "\t")
## Parsed with column specification:
## cols(
## .default = col_double(),
## siteID = col_character(),
## chr = col_character(),
## strand = col_character()
## )
## See spec(...) for full column specifications.
summary(macau.20190812$id == meth.destranded.df$siteID) #confirm that macau results and original destranded dataframe are in same order
## Mode TRUE
## logical 256043
# bind macau results with df that has contig id and loci information
macau.20190812.c <- cbind(meth.destranded.df[,1:4],macau.20190812)
write.table(macau.20190812.c, file = here::here("analyses", "macau", "macau.20190812.c.tab"), sep = "\t", row.names = FALSE, quote = FALSE)
From Mac’s 2019 salmon paper, doi:10.3390/genes10050356
“Multiple hypothesis testing was performed on P-values extracted from the MACAU output for each CG site using the false discovery rate (FDR) approach used in the R package qvalue [45]. We considered a CG site to be differentially methylated (i.e., differentially methylated cytosine (DMC) if it passed a 10% FDR threshold, consistent with [46].”
hist(macau.20190812$pvalue) # p-value distribution looks good (i.e. not u-shaped)
macau.qvalue <- qvalue(p = macau.20190812$pvalue, fdr.level = 0.1) # create qvalue object, setting FDR=10%
summary(macau.qvalue)
##
## Call:
## qvalue(p = macau.20190812$pvalue, fdr.level = 0.1)
##
## pi0: 0.9745183
##
## Cumulative number of significant calls:
##
## <1e-04 <0.001 <0.01 <0.025 <0.05 <0.1 <1
## p-value 233 849 4226 8732 15616 28662 256028
## q-value 1 7 22 54 111 219 256043
## local FDR 0 4 12 24 50 121 113358
100*(1- macau.qvalue$pi0) #% of all loci in df as truely sign.
## [1] 2.548171
hist(macau.qvalue)
#if one wants to estimate the false discovery rate when calling all p-values ≤ 0.01 significant:
max(macau.qvalue$qvalues[macau.qvalue$pvalues <= 0.01])
## [1] 0.5900381
# The main purpose of the upper-left plot is to gauge the reliability of the π0 estimate, where the estimated π0 is plotted versus the tuning parameter λ. The remaining plots show how many tests are significant, as well as how many false positives to expect for each q-value cut-off.
plot(macau.qvalue)
# Given our FDR=10% level, how many are significant?
summary(macau.qvalue$significant) #219 sign. (# TRUE)
## Mode FALSE TRUE
## logical 255824 219
# Add qvalue results to macau df
macau.FDR <- cbind(macau.20190812, do.call(cbind.data.frame, macau.qvalue[c("qvalues", "pvalues", "significant")]))
#plot macau p-values against pvalues in the qvalue results - should all fall on 1:1 line (double check that pvalues are in same order) -
plot(macau.FDR$pvalue, macau.FDR$pvalues) # looks good!
head(macau.sign <- subset(macau.FDR, significant=="TRUE"), n=20)
## id n acpt_rate beta se_beta pvalue
## 5843 Contig1054173286 18 2.926e-01 4.986e-01 1.054e-01 2.237e-06
## 9215 Contig109628199 18 2.832e-01 4.911e-01 1.216e-01 5.387e-05
## 9851 Contig1102783989 18 2.859e-01 5.282e-01 1.201e-01 1.098e-05
## 10312 Contig110906429 18 2.612e-01 3.133e-01 6.290e-02 6.351e-07
## 10342 Contig110927764 18 2.641e-01 3.686e-01 8.833e-02 3.001e-05
## 10982 Contig111772433 18 2.891e-01 -4.404e-01 1.012e-01 1.355e-05
## 11071 Contig1118734850 18 2.512e-01 6.613e-01 1.548e-01 1.929e-05
## 11956 Contig1129975372 18 2.600e-01 -3.496e-01 8.085e-02 1.534e-05
## 11970 Contig11310801 18 2.104e-01 7.822e-01 1.861e-01 2.630e-05
## 12204 Contig113403874 18 3.295e-01 3.111e-01 7.252e-02 1.790e-05
## 12337 Contig1135681836 18 3.255e-01 -7.990e-01 1.910e-01 2.886e-05
## 12379 Contig113675119 18 2.322e-01 -5.866e-01 1.423e-01 3.745e-05
## 13832 Contig117443800 18 2.785e-01 1.751e+00 3.729e-01 2.645e-06
## 15575 Contig120547984 18 3.502e-01 -2.134e-01 5.066e-02 2.512e-05
## 17436 Contig1237761289 18 2.026e-01 5.242e-01 1.198e-01 1.220e-05
## 17999 Contig1246379097 18 3.100e-01 -5.849e-01 1.290e-01 5.794e-06
## 18790 Contig126465792 18 2.044e-01 -4.076e-01 9.715e-02 2.723e-05
## 19689 Contig12795477 18 2.728e-01 -4.000e-01 9.372e-02 1.972e-05
## 20256 Contig12897313689 18 1.667e-01 8.452e-01 2.108e-01 6.096e-05
## 20265 Contig12897314453 18 1.354e-01 6.145e-01 1.490e-01 3.719e-05
## h se_h sigma2 se_sigma2 alpha0 se_alpha0
## 5843 7.040e-01 2.439e-01 7.871e+00 1.225e+01 -3.424e+00 2.191e+00
## 9215 7.224e-01 2.429e-01 1.071e+01 1.657e+01 -6.054e+00 2.291e+00
## 9851 7.375e-01 2.446e-01 8.995e+00 1.343e+01 -4.223e+00 2.274e+00
## 10312 7.652e-01 2.335e-01 1.004e+01 1.669e+01 -1.510e+00 1.214e+00
## 10342 7.198e-01 2.510e-01 9.976e+00 1.544e+01 -2.992e+00 1.735e+00
## 10982 7.636e-01 2.278e-01 7.196e+00 1.415e+01 7.664e+00 1.741e+00
## 11071 7.705e-01 2.083e-01 1.942e+01 2.829e+01 -6.436e+00 2.592e+00
## 11956 7.939e-01 2.275e-01 1.590e+01 2.451e+01 5.814e+00 1.666e+00
## 11970 8.484e-01 1.318e-01 6.841e+01 4.081e+01 -6.858e+00 4.023e+00
## 12204 6.518e-01 2.676e-01 3.273e+00 8.254e+00 -2.396e+00 1.242e+00
## 12337 6.712e-01 2.875e-01 8.346e+00 1.436e+01 1.220e+01 2.810e+00
## 12379 8.326e-01 1.730e-01 1.765e+01 2.153e+01 8.424e+00 2.639e+00
## 13832 7.425e-01 2.303e-01 2.768e+01 3.254e+01 -2.082e+01 5.106e+00
## 15575 3.357e-01 1.824e-01 9.063e-01 9.810e-01 4.215e+00 8.871e-01
## 17436 8.237e-01 1.520e-01 4.058e+01 3.382e+01 -6.992e+00 2.203e+00
## 17999 6.538e-01 2.715e-01 7.166e+00 1.333e+01 9.920e+00 2.068e+00
## 18790 7.506e-01 2.290e-01 2.764e+01 2.969e+01 6.922e+00 2.019e+00
## 19689 8.110e-01 1.842e-01 1.213e+01 1.865e+01 7.272e+00 1.595e+00
## 20256 7.425e-01 1.281e-01 6.243e+01 3.000e+01 -1.616e+01 4.211e+00
## 20265 9.198e-01 7.076e-02 9.710e+01 3.172e+01 -1.494e+01 4.617e+00
## alpha1 se_alpha1 qvalues pvalues significant
## 5843 -1.542e+00 4.904e-01 0.015504808 2.237e-06 TRUE
## 9215 -3.527e-01 6.728e-01 0.078853710 5.387e-05 TRUE
## 9851 -1.286e+00 4.032e-01 0.038587522 1.098e-05 TRUE
## 10312 -9.761e-01 2.726e-01 0.007343786 6.351e-07 TRUE
## 10342 -1.055e+00 3.935e-01 0.058046921 3.001e-05 TRUE
## 10982 2.855e+00 5.831e-01 0.041696660 1.355e-05 TRUE
## 11071 -1.788e+00 5.418e-01 0.047955096 1.929e-05 TRUE
## 11956 2.637e+00 4.261e-01 0.044507152 1.534e-05 TRUE
## 11970 -3.085e+00 9.503e-01 0.054852502 2.630e-05 TRUE
## 12204 -5.325e-01 3.977e-01 0.047955096 1.790e-05 TRUE
## 12337 4.574e+00 1.250e+00 0.057171441 2.886e-05 TRUE
## 12379 3.795e+00 1.047e+00 0.061883914 3.745e-05 TRUE
## 13832 -2.776e+00 1.579e+00 0.017367807 2.645e-06 TRUE
## 15575 5.513e-02 2.280e-01 0.054503538 2.512e-05 TRUE
## 17436 -9.916e-01 5.088e-01 0.040218456 1.220e-05 TRUE
## 17999 3.607e+00 7.478e-01 0.026168262 5.794e-06 TRUE
## 18790 1.252e+00 4.762e-01 0.055279524 2.723e-05 TRUE
## 19689 2.528e+00 5.570e-01 0.047955096 1.972e-05 TRUE
## 20256 -1.721e+00 7.231e-01 0.085663375 6.096e-05 TRUE
## 20265 -1.414e+00 5.701e-01 0.061883914 3.719e-05 TRUE
Use dataframe that has the siteID, and original contig, start #, and end # info
head(meth.destranded.df <- read.table(file = here::here("analyses", "meth.destranded.df.txt"), sep = "\t", header=T, stringsAsFactors = F))
## siteID chr start end strand coverage1 numCs1 numTs1
## 1 Contig038973 Contig0 38973 38973 + 15 13 2
## 2 Contig039226 Contig0 39226 39226 + 21 11 10
## 3 Contig039234 Contig0 39234 39234 + 24 10 14
## 4 Contig039252 Contig0 39252 39252 + 13 9 4
## 5 Contig041234 Contig0 41234 41234 + 4 3 1
## 6 Contig064124 Contig0 64124 64124 + 5 2 3
## coverage2 numCs2 numTs2 coverage3 numCs3 numTs3 coverage4 numCs4 numTs4
## 1 16 15 1 16 13 3 13 10 3
## 2 22 17 5 22 10 12 23 14 9
## 3 27 7 20 27 10 17 26 5 21
## 4 11 8 3 11 6 5 12 9 3
## 5 4 3 1 7 5 2 10 10 0
## 6 8 2 6 7 4 3 10 9 1
## coverage5 numCs5 numTs5 coverage6 numCs6 numTs6 coverage7 numCs7 numTs7
## 1 15 12 3 17 12 5 13 12 1
## 2 23 9 14 21 13 8 20 9 11
## 3 28 12 16 25 11 14 23 7 16
## 4 12 10 2 11 8 3 10 8 2
## 5 6 3 3 8 5 3 2 1 1
## 6 4 0 4 3 2 1 8 3 5
## coverage8 numCs8 numTs8 coverage9 numCs9 numTs9 coverage10 numCs10
## 1 11 9 2 19 13 6 6 5
## 2 16 10 6 31 18 13 17 9
## 3 24 8 16 36 8 28 19 7
## 4 13 9 4 14 10 4 7 4
## 5 2 2 0 10 7 3 7 4
## 6 3 1 2 11 4 7 3 0
## numTs10 coverage11 numCs11 numTs11 coverage12 numCs12 numTs12 coverage13
## 1 1 6 4 2 15 11 4 11
## 2 8 5 4 1 23 15 8 17
## 3 12 7 1 6 29 10 19 19
## 4 3 2 1 1 12 9 3 8
## 5 3 4 4 0 6 4 2 2
## 6 3 2 0 2 5 3 2 5
## numCs13 numTs13 coverage14 numCs14 numTs14 coverage15 numCs15 numTs15
## 1 7 4 8 6 2 9 7 2
## 2 10 7 10 5 5 20 8 12
## 3 8 11 12 5 7 25 12 13
## 4 7 1 6 4 2 9 6 3
## 5 2 0 2 2 0 3 2 1
## 6 2 3 10 4 6 10 7 3
## coverage16 numCs16 numTs16 coverage17 numCs17 numTs17 coverage18 numCs18
## 1 5 4 1 8 6 2 17 15
## 2 13 8 5 19 8 11 24 8
## 3 17 5 12 23 12 11 32 10
## 4 5 1 4 6 3 3 15 12
## 5 6 6 0 3 2 1 5 2
## 6 7 6 1 3 2 1 8 4
## numTs18
## 1 2
## 2 16
## 3 22
## 4 3
## 5 3
## 6 4
#counts.tot.destrand <- read.table(file = "../analyses/counts-total-destrand.txt", header = T)
#counts.meth.destrand <- read.table(file = "../analyses/counts-meth-destrand.txt", header = T)
This constitutes a master dataframe
# merge macau results with count dataframe, save to file
head(macau.sign.counts <- merge(x=macau.sign, y=subset(meth.destranded.df, siteID %in% macau.sign$id), by.x="id", by.y="siteID"))
## id n acpt_rate beta se_beta pvalue h
## 1 Contig1054173286 18 2.926e-01 4.986e-01 1.054e-01 2.237e-06 7.040e-01
## 2 Contig109628199 18 2.832e-01 4.911e-01 1.216e-01 5.387e-05 7.224e-01
## 3 Contig1102783989 18 2.859e-01 5.282e-01 1.201e-01 1.098e-05 7.375e-01
## 4 Contig110906429 18 2.612e-01 3.133e-01 6.290e-02 6.351e-07 7.652e-01
## 5 Contig110927764 18 2.641e-01 3.686e-01 8.833e-02 3.001e-05 7.198e-01
## 6 Contig111772433 18 2.891e-01 -4.404e-01 1.012e-01 1.355e-05 7.636e-01
## se_h sigma2 se_sigma2 alpha0 se_alpha0 alpha1 se_alpha1
## 1 2.439e-01 7.871e+00 1.225e+01 -3.424e+00 2.191e+00 -1.542e+00 4.904e-01
## 2 2.429e-01 1.071e+01 1.657e+01 -6.054e+00 2.291e+00 -3.527e-01 6.728e-01
## 3 2.446e-01 8.995e+00 1.343e+01 -4.223e+00 2.274e+00 -1.286e+00 4.032e-01
## 4 2.335e-01 1.004e+01 1.669e+01 -1.510e+00 1.214e+00 -9.761e-01 2.726e-01
## 5 2.510e-01 9.976e+00 1.544e+01 -2.992e+00 1.735e+00 -1.055e+00 3.935e-01
## 6 2.278e-01 7.196e+00 1.415e+01 7.664e+00 1.741e+00 2.855e+00 5.831e-01
## qvalues pvalues significant chr start end strand
## 1 0.015504808 2.237e-06 TRUE Contig105417 3286 3286 +
## 2 0.078853710 5.387e-05 TRUE Contig1096 28199 28199 +
## 3 0.038587522 1.098e-05 TRUE Contig110278 3989 3989 +
## 4 0.007343786 6.351e-07 TRUE Contig110906 429 429 +
## 5 0.058046921 3.001e-05 TRUE Contig11092 7764 7764 +
## 6 0.041696660 1.355e-05 TRUE Contig11177 2433 2433 +
## coverage1 numCs1 numTs1 coverage2 numCs2 numTs2 coverage3 numCs3 numTs3
## 1 16 14 2 12 12 0 7 7 0
## 2 11 9 2 10 9 1 9 9 0
## 3 18 16 2 12 12 0 8 8 0
## 4 13 10 3 29 27 2 32 32 0
## 5 10 4 6 17 16 1 13 12 1
## 6 13 13 0 16 15 1 14 13 1
## coverage4 numCs4 numTs4 coverage5 numCs5 numTs5 coverage6 numCs6 numTs6
## 1 10 10 0 6 6 0 10 10 0
## 2 10 10 0 7 7 0 5 5 0
## 3 13 13 0 13 13 0 11 11 0
## 4 31 30 1 21 21 0 16 15 1
## 5 14 14 0 18 18 0 9 9 0
## 6 16 15 1 12 10 2 8 7 1
## coverage7 numCs7 numTs7 coverage8 numCs8 numTs8 coverage9 numCs9 numTs9
## 1 9 9 0 10 9 1 16 15 1
## 2 13 12 1 8 8 0 8 6 2
## 3 12 11 1 12 12 0 15 14 1
## 4 31 29 2 32 32 0 21 19 2
## 5 13 11 2 19 13 6 18 16 2
## 6 17 16 1 14 14 0 16 16 0
## coverage10 numCs10 numTs10 coverage11 numCs11 numTs11 coverage12 numCs12
## 1 9 9 0 5 5 0 9 8
## 2 5 5 0 6 6 0 5 5
## 3 17 16 1 3 3 0 14 13
## 4 20 19 1 34 33 1 14 14
## 5 5 5 0 5 5 0 20 18
## 6 15 15 0 17 17 0 11 10
## numTs12 coverage13 numCs13 numTs13 coverage14 numCs14 numTs14 coverage15
## 1 1 15 15 0 19 19 0 19
## 2 0 11 11 0 12 12 0 9
## 3 1 9 9 0 10 9 1 10
## 4 0 29 28 1 32 32 0 24
## 5 2 16 15 1 17 17 0 7
## 6 1 21 20 1 24 24 0 25
## numCs15 numTs15 coverage16 numCs16 numTs16 coverage17 numCs17 numTs17
## 1 19 0 20 20 0 15 15 0
## 2 9 0 18 18 0 11 10 1
## 3 10 0 9 9 0 13 13 0
## 4 24 0 33 32 1 35 33 2
## 5 7 0 8 7 1 10 10 0
## 6 22 3 22 21 1 19 18 1
## coverage18 numCs18 numTs18
## 1 14 12 2
## 2 11 11 0
## 3 16 15 1
## 4 36 33 3
## 5 16 14 2
## 6 17 17 0
write.table(macau.sign.counts, file = here::here("analyses", "combined-macau-FDR-counts.txt"), sep = "\t", row.names = FALSE, quote = FALSE)
macau.sign.tot.counts <- macau.sign.counts[, !(grepl("num", colnames(macau.sign.counts)))]
macau.sign.meth.counts <- macau.sign.counts[, !(grepl("numT|coverage", colnames(macau.sign.counts)))]
macau.sign.tot.counts.10x <- macau.sign.tot.counts %>%
mutate_at(vars(contains('coverage')), funs(ifelse(. < 10, NA, .)))
## Warning: funs() is soft deprecated as of dplyr 0.8.0
## Please use a list of either functions or lambdas:
##
## # Simple named list:
## list(mean = mean, median = median)
##
## # Auto named with `tibble::lst()`:
## tibble::lst(mean, median)
##
## # Using lambdas
## list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
## This warning is displayed once per session.
# duplicate macau methylated count dataframe to make % methylated df for 10x coverage (all <10x cov will be "NA"")
macau.sign.perc.meth.10x <- macau.sign.meth.counts
# divide macau methylated counts by total counts
macau.sign.perc.meth.10x[,22:39] <- 100*(macau.sign.meth.counts[,22:39]/macau.sign.tot.counts.10x[,22:39])
rownames(macau.sign.perc.meth.10x) <- macau.sign.perc.meth.10x[,1]
Sample ordered from cluster analysis, where dendogram is constructed from Gower’s distance matrix (lots of NA values), and Ward cluster method.
—> mbd samples #1-9 = Hood Canal population
—> mbd samples #10-18 = South Sound population
par(cex.main=.8)
heatmap.2(as.matrix(macau.sign.perc.meth.10x[22:39]),
main = "MACAU id'd % methylated\n10x cov\nOrdered by cluster analysis",
xlab = "mbd sample", ylab = "loci",
hclustfun = function(x) hclust(x, method="ward.D"), distfunc <- function(x) daisy(x,metric="correlation"), trace="none", dendrogram = "column", na.color = "white",
col = colorRampPalette(c('#08519c','#bdd7e7'))(30))
## Warning in is.na(Rowv): is.na() applied to non-(list or vector) of type
## 'closure'
No cluster analysis, instead samples are presented in the same order as the MethylKit tree.
tree.order <- paste("numCs",c(15,17,12,10,11,13,18,14,16,8,1,6,9,2,4,5,3,7), sep="")
par(cex.main=.8)
heatmap.2(as.matrix(macau.sign.perc.meth.10x[tree.order]), main = "MACAU id'd % methylated\ngray=<10x cov\nOrdered by MethylKit tree",
xlab = "mbd sample", ylab = "loci", na.color = "white",trace="none", dendrogram = "none", Rowv=FALSE, Colv = FALSE,
col = colorRampPalette(c('#08519c','#bdd7e7'))(30))
No cluster analysis, instead samples are presented in the same order as the MethylKit tree.
par(cex.main=.8)
heatmap.2(as.matrix(macau.sign.perc.meth.10x[!rowSums(is.na(macau.sign.perc.meth.10x)) > 0.25*18,tree.order]),
main = "MACAU id'd % methylation\nFDR=10%, loci cov=10x\nOrdered by MethylKit tree",
xlab = "mbd sample", ylab = "loci", trace="none", dendrogram = "none", Rowv=FALSE, Colv = FALSE,
col = colorRampPalette(c('#08519c','#bdd7e7'))(30))
No cluster analysis, instead samples are presented in the same order as the MethylKit tree.
This is laborious code with some room for error - needs fixing
size <- read.csv(file = here::here("data","mbd_size.csv"), header=T, sep = "\t")
key <- read.csv(file=here::here("data","sample-key.csv"))
size.macau <- cbind(x=c(rep(1,times=18)), y=merge(y=size, x=key, by.y="Sample", by.x="SAMPLE")) %>%
mutate(population=as.factor(c(rep("HC", times=9), rep("SS", times=9)))) %>%
mutate(sample=paste("numCs", y.MBD.FILENAME, sep="")) %>%
arrange(y.Length)
mycols <- c("indianred", "darkcyan")
#pdf(file = here::here("..", "analyses","macau-heatmap.pdf"))
par(cex.main=.8)
heatmap.2(as.matrix(macau.sign.perc.meth.10x[!rowSums(is.na(macau.sign.perc.meth.10x)) > 0.25*18,size.macau$sample]),
main = "MACAU id'd % methylation\nFDR=10%, loci cov=10x (75% of samples)\nOrdered by Length", xlab = "mbd sample", ylab = "loci",
trace="none", dendrogram = "none", Rowv=FALSE, Colv = FALSE,
col = colorRampPalette(c('#08519c','#bdd7e7'))(30))
par(oma=c(0,8,0,0))
barplot(height = size.macau$y.Length, names.arg = size.macau$sample, col = mycols[size.macau$pop], cex.names = 0.8,
main = "Shell Length (mm)", las=2)
#dev.off()
macau.sign.perc.meth.10x %>%
dplyr::select(c("chr", "start", "end")) %>%
write_delim(here::here("analyses","macau","macau-any10x.bed"), delim = '\t', col_names = FALSE)
macau.sign.perc.meth.10x[complete.cases(macau.sign.perc.meth.10x[22:39]), ] %>%
dplyr::select(c("chr", "start", "end")) %>%
write_delim(here::here("analyses","macau", "macau-all10x.bed"), delim = '\t', col_names = FALSE)
macau.sign.perc.meth.10x[!rowSums(is.na(macau.sign.perc.meth.10x)) > 0.25*18,] %>%
dplyr::select(c("chr", "start", "end")) %>%
write_delim(here::here("analyses","macau", "macau-10x75perc.bed"), delim = '\t', col_names = FALSE)
# All loci fed into/from macau
macau.20190812.c %>%
dplyr::select(c("chr", "start", "end")) %>%
write_delim(here::here("analyses","macau", "macau-all-loci.bed"), delim = '\t', col_names = FALSE)
source(here::here("resources","biostats.R"))
# Transpose methylated count df, remove any loci with NA values (i.e. <10x)
macau.sign.perc.meth.10x.t <- macau.sign.perc.meth.10x[complete.cases(macau.sign.perc.meth.10x[22:39]), c(1,22:39)] %>% remove_rownames %>% gather(column, value, -id) %>% spread(id, value) %>% column_to_rownames(var="column")
# Run PCA on transposed matrix, scale=F creates variance-covariance matrix since all data is same units
met.perc.pca <- prcomp(macau.sign.perc.meth.10x.t, scale=F)
summary(met.perc.pca)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 43.8380 26.1830 23.0663 20.3097 17.53486 14.26078
## Proportion of Variance 0.4133 0.1474 0.1144 0.0887 0.06612 0.04373
## Cumulative Proportion 0.4133 0.5607 0.6751 0.7638 0.82994 0.87368
## PC7 PC8 PC9 PC10 PC11 PC12
## Standard deviation 12.26335 10.31811 9.55722 8.62860 7.87000 6.31613
## Proportion of Variance 0.03234 0.02289 0.01964 0.01601 0.01332 0.00858
## Cumulative Proportion 0.90602 0.92891 0.94855 0.96457 0.97788 0.98646
## PC13 PC14 PC15 PC16 PC17 PC18
## Standard deviation 5.15991 4.21017 2.95146 2.86058 1.30403 1.305e-14
## Proportion of Variance 0.00573 0.00381 0.00187 0.00176 0.00037 0.000e+00
## Cumulative Proportion 0.99219 0.99600 0.99787 0.99963 1.00000 1.000e+00
pca.eigenval(met.perc.pca) #The Proporation of Variance = %variance
## Importance of components:
## PC1 PC2 PC3 PC4
## Variance(eigenvalue) 1921.7681718 685.5474541 532.0556016 412.4831046
## Proportion of Variance 0.4132731 0.1474258 0.1144177 0.0887038
## Cumulative Proportion 0.4132731 0.5606989 0.6751166 0.7638204
## Broken-stick value 3.4951081 2.4951081 1.9951081 1.6617747
## PC5 PC6 PC7 PC8
## Variance(eigenvalue) 307.4713719 203.3699508 150.3898581 106.4633449
## Proportion of Variance 0.0661212 0.0437344 0.0323411 0.0228948
## Cumulative Proportion 0.8299416 0.8736760 0.9060171 0.9289119
## Broken-stick value 1.4117747 1.2117747 1.0451081 0.9022509
## PC9 PC10 PC11 PC12
## Variance(eigenvalue) 91.3404352 74.4527194 61.9368270 39.8934511
## Proportion of Variance 0.0196426 0.0160109 0.0133194 0.0085790
## Cumulative Proportion 0.9485545 0.9645654 0.9778848 0.9864638
## Broken-stick value 0.7772509 0.6661398 0.5661398 0.4752307
## PC13 PC14 PC15 PC16 PC17
## Variance(eigenvalue) 26.6246520 17.7255445 8.7111227 8.1829252 1.7004847
## Proportion of Variance 0.0057256 0.0038118 0.0018733 0.0017597 0.0003657
## Cumulative Proportion 0.9921894 0.9960013 0.9978746 0.9996343 1.0000000
## Broken-stick value 0.3918974 0.3149743 0.2435458 0.1768791 0.1143791
## PC18
## Variance(eigenvalue) 0.0000000
## Proportion of Variance 0.0000000
## Cumulative Proportion 1.0000000
## Broken-stick value 0.0555556
fviz_screeplot(met.perc.pca, addlabels = TRUE) #PC 1, 2 & 3 may be sign.
ordi.monte(macau.sign.perc.meth.10x.t,ord="pca",dim=5) #only pc 1 sign. according to ordi.monte
## Press return for next plot
## Press return for next plot
## Press return for next plot
## Press return for next plot
## Press return for next plot
## Randomization Test of Eigenvalues:
## PC1 PC2 PC3 PC4 PC5
## Eigenvalue 8.484 3.770 2.817 2.239 2.108
## P-value 0.000 0.206 0.941 0.999 0.941
fviz_contrib(met.perc.pca, choice = "var", axes = 1, top=50)
fviz_contrib(met.perc.pca, choice = "var", axes = 2, top=50)
fviz_contrib(met.perc.pca, choice = "var", axes = 3, top=50)
pops <- c("HC", rep("SS", times=9), rep("HC", times=8)) #create pop vector for color-coding
fviz_pca_biplot(met.perc.pca, label="ind", invisible = "var", col.ind=pops, axes = c(1,2), pointsize=3) + ggtitle(label="PCA % methylated, loci id'd by MACAU, PC 1x2")
fviz_pca_biplot(met.perc.pca, label="ind", invisible = "var",col.ind=pops, axes = c(1,3), pointsize=3) + ggtitle(label="PCA % methylated, loci id'd by MACAU, PC 1x3")
fviz_pca_biplot(met.perc.pca, label="ind", invisible = "var", col.ind=pops, axes = c(2,3), pointsize=3) + ggtitle(label="PCA % methylated, loci id'd by MACAU, PC 2x3") #dimension 2 = pop separation