MACAU results - visualize and FDR correct

Ran MACAU in a Jupyter Notebook. See 05-MACAU.ipynb

In this notebook I assess patterns in methylated loci that MACAU identified as influence by oyster phenotype, aka oyster shell length.

Load libraries

list.of.packages <- c("vegan", "dplyr", "factoextra", "FactoMineR", "tidyverse", "tibble", "reshape2", "gplots", "qvalue", "cluster", "here", "gridExtra") #add new libraries here 
new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
if(length(new.packages)) install.packages(new.packages)

# Load all libraries 
lapply(list.of.packages, FUN = function(X) {
  do.call("require", list(X)) 
})

Read in dataframe with MACAU assocation results

macau.20190812 <- read.table(file=here::here("analyses","macau","output","20190812-macau.assoc.txt"), sep = "\t", stringsAsFactors = F)
colnames(macau.20190812) <- macau.20190812[1,]
macau.20190812 <- macau.20190812[-1, ] 
macau.20190812$pvalue <- as.numeric(macau.20190812$pvalue)
hist(macau.20190812$pvalue)

# Read dataframes in if needed (object created in noteboook 04)
meth.destranded.df <- read_delim(here::here("analyses", "meth.destranded.df.txt"), delim = "\t")
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   siteID = col_character(),
##   chr = col_character(),
##   strand = col_character()
## )
## See spec(...) for full column specifications.
summary(macau.20190812$id == meth.destranded.df$siteID) #confirm that macau results and original destranded dataframe are in same order 
##    Mode    TRUE 
## logical  256043
# bind macau results with df that has contig id and loci information 
macau.20190812.c <- cbind(meth.destranded.df[,1:4],macau.20190812) 
write.table(macau.20190812.c, file = here::here("analyses", "macau", "macau.20190812.c.tab"), sep = "\t", row.names = FALSE,  quote = FALSE)

Correct for multiple comparisons

From Mac’s 2019 salmon paper, doi:10.3390/genes10050356
“Multiple hypothesis testing was performed on P-values extracted from the MACAU output for each CG site using the false discovery rate (FDR) approach used in the R package qvalue [45]. We considered a CG site to be differentially methylated (i.e., differentially methylated cytosine (DMC) if it passed a 10% FDR threshold, consistent with [46].”

hist(macau.20190812$pvalue) # p-value distribution looks good (i.e. not u-shaped)

macau.qvalue <- qvalue(p = macau.20190812$pvalue, fdr.level = 0.1) # create qvalue object, setting FDR=10%
summary(macau.qvalue) 
## 
## Call:
## qvalue(p = macau.20190812$pvalue, fdr.level = 0.1)
## 
## pi0: 0.9745183   
## 
## Cumulative number of significant calls:
## 
##           <1e-04 <0.001 <0.01 <0.025 <0.05  <0.1     <1
## p-value      233    849  4226   8732 15616 28662 256028
## q-value        1      7    22     54   111   219 256043
## local FDR      0      4    12     24    50   121 113358
100*(1- macau.qvalue$pi0) #% of all loci in df as truely sign.  
## [1] 2.548171
hist(macau.qvalue)

#if one wants to estimate the false discovery rate when calling all p-values ≤ 0.01 significant: 
max(macau.qvalue$qvalues[macau.qvalue$pvalues <= 0.01])
## [1] 0.5900381
# The main purpose of the upper-left plot is to gauge the reliability of the π0 estimate, where the estimated π0 is plotted versus the tuning parameter λ. The remaining plots show how many tests are significant, as well as how many false positives to expect for each q-value cut-off.
plot(macau.qvalue)

# Given our FDR=10% level, how many are significant? 
summary(macau.qvalue$significant) #219 sign. (# TRUE)
##    Mode   FALSE    TRUE 
## logical  255824     219
# Add qvalue results to macau df 
macau.FDR <- cbind(macau.20190812, do.call(cbind.data.frame, macau.qvalue[c("qvalues", "pvalues", "significant")]))

#plot macau p-values against pvalues in the qvalue results - should all fall on 1:1 line (double check that pvalues are in same order) - 
plot(macau.FDR$pvalue, macau.FDR$pvalues) # looks good! 

Extract IDs for loci where MACAU indicates DML, FDR test (at 10%) deemed loci significantt

head(macau.sign <- subset(macau.FDR, significant=="TRUE"), n=20)
##                      id  n acpt_rate       beta   se_beta    pvalue
## 5843   Contig1054173286 18 2.926e-01  4.986e-01 1.054e-01 2.237e-06
## 9215    Contig109628199 18 2.832e-01  4.911e-01 1.216e-01 5.387e-05
## 9851   Contig1102783989 18 2.859e-01  5.282e-01 1.201e-01 1.098e-05
## 10312   Contig110906429 18 2.612e-01  3.133e-01 6.290e-02 6.351e-07
## 10342   Contig110927764 18 2.641e-01  3.686e-01 8.833e-02 3.001e-05
## 10982   Contig111772433 18 2.891e-01 -4.404e-01 1.012e-01 1.355e-05
## 11071  Contig1118734850 18 2.512e-01  6.613e-01 1.548e-01 1.929e-05
## 11956  Contig1129975372 18 2.600e-01 -3.496e-01 8.085e-02 1.534e-05
## 11970    Contig11310801 18 2.104e-01  7.822e-01 1.861e-01 2.630e-05
## 12204   Contig113403874 18 3.295e-01  3.111e-01 7.252e-02 1.790e-05
## 12337  Contig1135681836 18 3.255e-01 -7.990e-01 1.910e-01 2.886e-05
## 12379   Contig113675119 18 2.322e-01 -5.866e-01 1.423e-01 3.745e-05
## 13832   Contig117443800 18 2.785e-01  1.751e+00 3.729e-01 2.645e-06
## 15575   Contig120547984 18 3.502e-01 -2.134e-01 5.066e-02 2.512e-05
## 17436  Contig1237761289 18 2.026e-01  5.242e-01 1.198e-01 1.220e-05
## 17999  Contig1246379097 18 3.100e-01 -5.849e-01 1.290e-01 5.794e-06
## 18790   Contig126465792 18 2.044e-01 -4.076e-01 9.715e-02 2.723e-05
## 19689    Contig12795477 18 2.728e-01 -4.000e-01 9.372e-02 1.972e-05
## 20256 Contig12897313689 18 1.667e-01  8.452e-01 2.108e-01 6.096e-05
## 20265 Contig12897314453 18 1.354e-01  6.145e-01 1.490e-01 3.719e-05
##               h      se_h    sigma2 se_sigma2     alpha0 se_alpha0
## 5843  7.040e-01 2.439e-01 7.871e+00 1.225e+01 -3.424e+00 2.191e+00
## 9215  7.224e-01 2.429e-01 1.071e+01 1.657e+01 -6.054e+00 2.291e+00
## 9851  7.375e-01 2.446e-01 8.995e+00 1.343e+01 -4.223e+00 2.274e+00
## 10312 7.652e-01 2.335e-01 1.004e+01 1.669e+01 -1.510e+00 1.214e+00
## 10342 7.198e-01 2.510e-01 9.976e+00 1.544e+01 -2.992e+00 1.735e+00
## 10982 7.636e-01 2.278e-01 7.196e+00 1.415e+01  7.664e+00 1.741e+00
## 11071 7.705e-01 2.083e-01 1.942e+01 2.829e+01 -6.436e+00 2.592e+00
## 11956 7.939e-01 2.275e-01 1.590e+01 2.451e+01  5.814e+00 1.666e+00
## 11970 8.484e-01 1.318e-01 6.841e+01 4.081e+01 -6.858e+00 4.023e+00
## 12204 6.518e-01 2.676e-01 3.273e+00 8.254e+00 -2.396e+00 1.242e+00
## 12337 6.712e-01 2.875e-01 8.346e+00 1.436e+01  1.220e+01 2.810e+00
## 12379 8.326e-01 1.730e-01 1.765e+01 2.153e+01  8.424e+00 2.639e+00
## 13832 7.425e-01 2.303e-01 2.768e+01 3.254e+01 -2.082e+01 5.106e+00
## 15575 3.357e-01 1.824e-01 9.063e-01 9.810e-01  4.215e+00 8.871e-01
## 17436 8.237e-01 1.520e-01 4.058e+01 3.382e+01 -6.992e+00 2.203e+00
## 17999 6.538e-01 2.715e-01 7.166e+00 1.333e+01  9.920e+00 2.068e+00
## 18790 7.506e-01 2.290e-01 2.764e+01 2.969e+01  6.922e+00 2.019e+00
## 19689 8.110e-01 1.842e-01 1.213e+01 1.865e+01  7.272e+00 1.595e+00
## 20256 7.425e-01 1.281e-01 6.243e+01 3.000e+01 -1.616e+01 4.211e+00
## 20265 9.198e-01 7.076e-02 9.710e+01 3.172e+01 -1.494e+01 4.617e+00
##           alpha1 se_alpha1     qvalues   pvalues significant
## 5843  -1.542e+00 4.904e-01 0.015504808 2.237e-06        TRUE
## 9215  -3.527e-01 6.728e-01 0.078853710 5.387e-05        TRUE
## 9851  -1.286e+00 4.032e-01 0.038587522 1.098e-05        TRUE
## 10312 -9.761e-01 2.726e-01 0.007343786 6.351e-07        TRUE
## 10342 -1.055e+00 3.935e-01 0.058046921 3.001e-05        TRUE
## 10982  2.855e+00 5.831e-01 0.041696660 1.355e-05        TRUE
## 11071 -1.788e+00 5.418e-01 0.047955096 1.929e-05        TRUE
## 11956  2.637e+00 4.261e-01 0.044507152 1.534e-05        TRUE
## 11970 -3.085e+00 9.503e-01 0.054852502 2.630e-05        TRUE
## 12204 -5.325e-01 3.977e-01 0.047955096 1.790e-05        TRUE
## 12337  4.574e+00 1.250e+00 0.057171441 2.886e-05        TRUE
## 12379  3.795e+00 1.047e+00 0.061883914 3.745e-05        TRUE
## 13832 -2.776e+00 1.579e+00 0.017367807 2.645e-06        TRUE
## 15575  5.513e-02 2.280e-01 0.054503538 2.512e-05        TRUE
## 17436 -9.916e-01 5.088e-01 0.040218456 1.220e-05        TRUE
## 17999  3.607e+00 7.478e-01 0.026168262 5.794e-06        TRUE
## 18790  1.252e+00 4.762e-01 0.055279524 2.723e-05        TRUE
## 19689  2.528e+00 5.570e-01 0.047955096 1.972e-05        TRUE
## 20256 -1.721e+00 7.231e-01 0.085663375 6.096e-05        TRUE
## 20265 -1.414e+00 5.701e-01 0.061883914 3.719e-05        TRUE

Read in count data

Use dataframe that has the siteID, and original contig, start #, and end # info

head(meth.destranded.df <- read.table(file = here::here("analyses", "meth.destranded.df.txt"), sep = "\t", header=T, stringsAsFactors = F))
##         siteID     chr start   end strand coverage1 numCs1 numTs1
## 1 Contig038973 Contig0 38973 38973      +        15     13      2
## 2 Contig039226 Contig0 39226 39226      +        21     11     10
## 3 Contig039234 Contig0 39234 39234      +        24     10     14
## 4 Contig039252 Contig0 39252 39252      +        13      9      4
## 5 Contig041234 Contig0 41234 41234      +         4      3      1
## 6 Contig064124 Contig0 64124 64124      +         5      2      3
##   coverage2 numCs2 numTs2 coverage3 numCs3 numTs3 coverage4 numCs4 numTs4
## 1        16     15      1        16     13      3        13     10      3
## 2        22     17      5        22     10     12        23     14      9
## 3        27      7     20        27     10     17        26      5     21
## 4        11      8      3        11      6      5        12      9      3
## 5         4      3      1         7      5      2        10     10      0
## 6         8      2      6         7      4      3        10      9      1
##   coverage5 numCs5 numTs5 coverage6 numCs6 numTs6 coverage7 numCs7 numTs7
## 1        15     12      3        17     12      5        13     12      1
## 2        23      9     14        21     13      8        20      9     11
## 3        28     12     16        25     11     14        23      7     16
## 4        12     10      2        11      8      3        10      8      2
## 5         6      3      3         8      5      3         2      1      1
## 6         4      0      4         3      2      1         8      3      5
##   coverage8 numCs8 numTs8 coverage9 numCs9 numTs9 coverage10 numCs10
## 1        11      9      2        19     13      6          6       5
## 2        16     10      6        31     18     13         17       9
## 3        24      8     16        36      8     28         19       7
## 4        13      9      4        14     10      4          7       4
## 5         2      2      0        10      7      3          7       4
## 6         3      1      2        11      4      7          3       0
##   numTs10 coverage11 numCs11 numTs11 coverage12 numCs12 numTs12 coverage13
## 1       1          6       4       2         15      11       4         11
## 2       8          5       4       1         23      15       8         17
## 3      12          7       1       6         29      10      19         19
## 4       3          2       1       1         12       9       3          8
## 5       3          4       4       0          6       4       2          2
## 6       3          2       0       2          5       3       2          5
##   numCs13 numTs13 coverage14 numCs14 numTs14 coverage15 numCs15 numTs15
## 1       7       4          8       6       2          9       7       2
## 2      10       7         10       5       5         20       8      12
## 3       8      11         12       5       7         25      12      13
## 4       7       1          6       4       2          9       6       3
## 5       2       0          2       2       0          3       2       1
## 6       2       3         10       4       6         10       7       3
##   coverage16 numCs16 numTs16 coverage17 numCs17 numTs17 coverage18 numCs18
## 1          5       4       1          8       6       2         17      15
## 2         13       8       5         19       8      11         24       8
## 3         17       5      12         23      12      11         32      10
## 4          5       1       4          6       3       3         15      12
## 5          6       6       0          3       2       1          5       2
## 6          7       6       1          3       2       1          8       4
##   numTs18
## 1       2
## 2      16
## 3      22
## 4       3
## 5       3
## 6       4
#counts.tot.destrand <- read.table(file = "../analyses/counts-total-destrand.txt", header = T)
#counts.meth.destrand <- read.table(file = "../analyses/counts-meth-destrand.txt", header = T)

Merge MACAU + FDR results with raw count data

This constitutes a master dataframe

# merge macau results with count dataframe, save to file 
head(macau.sign.counts <- merge(x=macau.sign, y=subset(meth.destranded.df, siteID %in% macau.sign$id), by.x="id", by.y="siteID"))
##                 id  n acpt_rate       beta   se_beta    pvalue         h
## 1 Contig1054173286 18 2.926e-01  4.986e-01 1.054e-01 2.237e-06 7.040e-01
## 2  Contig109628199 18 2.832e-01  4.911e-01 1.216e-01 5.387e-05 7.224e-01
## 3 Contig1102783989 18 2.859e-01  5.282e-01 1.201e-01 1.098e-05 7.375e-01
## 4  Contig110906429 18 2.612e-01  3.133e-01 6.290e-02 6.351e-07 7.652e-01
## 5  Contig110927764 18 2.641e-01  3.686e-01 8.833e-02 3.001e-05 7.198e-01
## 6  Contig111772433 18 2.891e-01 -4.404e-01 1.012e-01 1.355e-05 7.636e-01
##        se_h    sigma2 se_sigma2     alpha0 se_alpha0     alpha1 se_alpha1
## 1 2.439e-01 7.871e+00 1.225e+01 -3.424e+00 2.191e+00 -1.542e+00 4.904e-01
## 2 2.429e-01 1.071e+01 1.657e+01 -6.054e+00 2.291e+00 -3.527e-01 6.728e-01
## 3 2.446e-01 8.995e+00 1.343e+01 -4.223e+00 2.274e+00 -1.286e+00 4.032e-01
## 4 2.335e-01 1.004e+01 1.669e+01 -1.510e+00 1.214e+00 -9.761e-01 2.726e-01
## 5 2.510e-01 9.976e+00 1.544e+01 -2.992e+00 1.735e+00 -1.055e+00 3.935e-01
## 6 2.278e-01 7.196e+00 1.415e+01  7.664e+00 1.741e+00  2.855e+00 5.831e-01
##       qvalues   pvalues significant          chr start   end strand
## 1 0.015504808 2.237e-06        TRUE Contig105417  3286  3286      +
## 2 0.078853710 5.387e-05        TRUE   Contig1096 28199 28199      +
## 3 0.038587522 1.098e-05        TRUE Contig110278  3989  3989      +
## 4 0.007343786 6.351e-07        TRUE Contig110906   429   429      +
## 5 0.058046921 3.001e-05        TRUE  Contig11092  7764  7764      +
## 6 0.041696660 1.355e-05        TRUE  Contig11177  2433  2433      +
##   coverage1 numCs1 numTs1 coverage2 numCs2 numTs2 coverage3 numCs3 numTs3
## 1        16     14      2        12     12      0         7      7      0
## 2        11      9      2        10      9      1         9      9      0
## 3        18     16      2        12     12      0         8      8      0
## 4        13     10      3        29     27      2        32     32      0
## 5        10      4      6        17     16      1        13     12      1
## 6        13     13      0        16     15      1        14     13      1
##   coverage4 numCs4 numTs4 coverage5 numCs5 numTs5 coverage6 numCs6 numTs6
## 1        10     10      0         6      6      0        10     10      0
## 2        10     10      0         7      7      0         5      5      0
## 3        13     13      0        13     13      0        11     11      0
## 4        31     30      1        21     21      0        16     15      1
## 5        14     14      0        18     18      0         9      9      0
## 6        16     15      1        12     10      2         8      7      1
##   coverage7 numCs7 numTs7 coverage8 numCs8 numTs8 coverage9 numCs9 numTs9
## 1         9      9      0        10      9      1        16     15      1
## 2        13     12      1         8      8      0         8      6      2
## 3        12     11      1        12     12      0        15     14      1
## 4        31     29      2        32     32      0        21     19      2
## 5        13     11      2        19     13      6        18     16      2
## 6        17     16      1        14     14      0        16     16      0
##   coverage10 numCs10 numTs10 coverage11 numCs11 numTs11 coverage12 numCs12
## 1          9       9       0          5       5       0          9       8
## 2          5       5       0          6       6       0          5       5
## 3         17      16       1          3       3       0         14      13
## 4         20      19       1         34      33       1         14      14
## 5          5       5       0          5       5       0         20      18
## 6         15      15       0         17      17       0         11      10
##   numTs12 coverage13 numCs13 numTs13 coverage14 numCs14 numTs14 coverage15
## 1       1         15      15       0         19      19       0         19
## 2       0         11      11       0         12      12       0          9
## 3       1          9       9       0         10       9       1         10
## 4       0         29      28       1         32      32       0         24
## 5       2         16      15       1         17      17       0          7
## 6       1         21      20       1         24      24       0         25
##   numCs15 numTs15 coverage16 numCs16 numTs16 coverage17 numCs17 numTs17
## 1      19       0         20      20       0         15      15       0
## 2       9       0         18      18       0         11      10       1
## 3      10       0          9       9       0         13      13       0
## 4      24       0         33      32       1         35      33       2
## 5       7       0          8       7       1         10      10       0
## 6      22       3         22      21       1         19      18       1
##   coverage18 numCs18 numTs18
## 1         14      12       2
## 2         11      11       0
## 3         16      15       1
## 4         36      33       3
## 5         16      14       2
## 6         17      17       0
write.table(macau.sign.counts, file = here::here("analyses", "combined-macau-FDR-counts.txt"), sep = "\t", row.names = FALSE,  quote = FALSE) 

Pull out separate methylation count & coverage count dfs

macau.sign.tot.counts <- macau.sign.counts[, !(grepl("num", colnames(macau.sign.counts)))]
macau.sign.meth.counts <- macau.sign.counts[, !(grepl("numT|coverage", colnames(macau.sign.counts)))]

Create df where coverage counts <5 replaced with NA

macau.sign.tot.counts.10x <- macau.sign.tot.counts %>% 
  mutate_at(vars(contains('coverage')), funs(ifelse(. < 10, NA, .)))
## Warning: funs() is soft deprecated as of dplyr 0.8.0
## Please use a list of either functions or lambdas: 
## 
##   # Simple named list: 
##   list(mean = mean, median = median)
## 
##   # Auto named with `tibble::lst()`: 
##   tibble::lst(mean, median)
## 
##   # Using lambdas
##   list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
## This warning is displayed once per session.

Create % methylated dataframe with sign. MACAU loci

# duplicate macau methylated count dataframe to make % methylated df for 10x coverage (all <10x cov will be "NA"")
macau.sign.perc.meth.10x <- macau.sign.meth.counts

# divide macau methylated counts by total counts 
macau.sign.perc.meth.10x[,22:39] <- 100*(macau.sign.meth.counts[,22:39]/macau.sign.tot.counts.10x[,22:39])
rownames(macau.sign.perc.meth.10x) <- macau.sign.perc.meth.10x[,1]

Heatmap of % methylated, cluster analysis order, with NA for instances that loci coverage <10x

Sample ordered from cluster analysis, where dendogram is constructed from Gower’s distance matrix (lots of NA values), and Ward cluster method.

—> mbd samples #1-9 = Hood Canal population
—> mbd samples #10-18 = South Sound population

Heatmap of % methylated, cluster analysis order.

  • Only loci with 10x coverage in color, less than 10x coverage in white.
  • Sample ordered from cluster analysis, where dendogram is constructed from correlation matrix, and Ward cluster method.
par(cex.main=.8)
heatmap.2(as.matrix(macau.sign.perc.meth.10x[22:39]), 
          main = "MACAU id'd % methylated\n10x cov\nOrdered by cluster analysis", 
          xlab = "mbd sample", ylab = "loci",  
          hclustfun = function(x) hclust(x, method="ward.D"), distfunc <- function(x) daisy(x,metric="correlation"), trace="none", dendrogram = "column", na.color = "white",
          col = colorRampPalette(c('#08519c','#bdd7e7'))(30))
## Warning in is.na(Rowv): is.na() applied to non-(list or vector) of type
## 'closure'

Heatmap of % methylated, tree order, with NA for instances that loci coverage <10x

No cluster analysis, instead samples are presented in the same order as the MethylKit tree.

tree.order <- paste("numCs",c(15,17,12,10,11,13,18,14,16,8,1,6,9,2,4,5,3,7), sep="")
par(cex.main=.8)
heatmap.2(as.matrix(macau.sign.perc.meth.10x[tree.order]), main = "MACAU id'd % methylated\ngray=<10x cov\nOrdered by MethylKit tree", 
          xlab = "mbd sample", ylab = "loci", na.color = "white",trace="none", dendrogram = "none", Rowv=FALSE, Colv = FALSE,
          col = colorRampPalette(c('#08519c','#bdd7e7'))(30))

Heatmap of % methylated, tree order, only loci with 10x coverage across 75% of samples

No cluster analysis, instead samples are presented in the same order as the MethylKit tree.

par(cex.main=.8)
heatmap.2(as.matrix(macau.sign.perc.meth.10x[!rowSums(is.na(macau.sign.perc.meth.10x)) > 0.25*18,tree.order]), 
          main = "MACAU id'd % methylation\nFDR=10%, loci cov=10x\nOrdered by MethylKit tree", 
          xlab = "mbd sample", ylab = "loci", trace="none", dendrogram = "none", Rowv=FALSE, Colv = FALSE,
          col = colorRampPalette(c('#08519c','#bdd7e7'))(30))

Heatmap of % methylated, size order, only loci with 10x coverage across all samples

No cluster analysis, instead samples are presented in the same order as the MethylKit tree.

Prepare size data to plot in same order as this heat map

This is laborious code with some room for error - needs fixing

size <- read.csv(file = here::here("data","mbd_size.csv"), header=T, sep = "\t")
key <- read.csv(file=here::here("data","sample-key.csv"))
size.macau <- cbind(x=c(rep(1,times=18)), y=merge(y=size, x=key, by.y="Sample", by.x="SAMPLE")) %>%
  mutate(population=as.factor(c(rep("HC", times=9), rep("SS",  times=9)))) %>%
  mutate(sample=paste("numCs", y.MBD.FILENAME, sep="")) %>%
  arrange(y.Length)
mycols <- c("indianred", "darkcyan")

#pdf(file = here::here("..", "analyses","macau-heatmap.pdf"))
par(cex.main=.8)
heatmap.2(as.matrix(macau.sign.perc.meth.10x[!rowSums(is.na(macau.sign.perc.meth.10x)) > 0.25*18,size.macau$sample]), 
          main = "MACAU id'd % methylation\nFDR=10%, loci cov=10x (75% of samples)\nOrdered by Length", xlab = "mbd sample", ylab = "loci", 
          trace="none", dendrogram = "none", Rowv=FALSE, Colv = FALSE,
          col = colorRampPalette(c('#08519c','#bdd7e7'))(30))

par(oma=c(0,8,0,0))
barplot(height = size.macau$y.Length,  names.arg = size.macau$sample, col = mycols[size.macau$pop], cex.names =  0.8,
        main = "Shell Length (mm)", las=2)

#dev.off()

Create BED files for MACAU id’d loci with 10x coverage for 1) any samples, 2) all samples

macau.sign.perc.meth.10x %>%
  dplyr::select(c("chr", "start", "end")) %>%
write_delim(here::here("analyses","macau","macau-any10x.bed"),  delim = '\t', col_names = FALSE)

macau.sign.perc.meth.10x[complete.cases(macau.sign.perc.meth.10x[22:39]), ] %>%
  dplyr::select(c("chr", "start", "end")) %>%
write_delim(here::here("analyses","macau", "macau-all10x.bed"),  delim = '\t', col_names = FALSE)

macau.sign.perc.meth.10x[!rowSums(is.na(macau.sign.perc.meth.10x)) > 0.25*18,] %>%
  dplyr::select(c("chr", "start", "end")) %>%
write_delim(here::here("analyses","macau", "macau-10x75perc.bed"),  delim = '\t', col_names = FALSE)

# All loci fed into/from macau 
macau.20190812.c %>%
  dplyr::select(c("chr", "start", "end")) %>%
write_delim(here::here("analyses","macau", "macau-all-loci.bed"),  delim = '\t', col_names = FALSE)

BONEYARD

PCA with % methylated df, only with loci w/ 10x coverage across all samples

source(here::here("resources","biostats.R"))

# Transpose methylated count df, remove any loci with NA values (i.e. <10x) 
macau.sign.perc.meth.10x.t <- macau.sign.perc.meth.10x[complete.cases(macau.sign.perc.meth.10x[22:39]), c(1,22:39)] %>% remove_rownames %>% gather(column, value, -id) %>% spread(id, value) %>% column_to_rownames(var="column")

# Run PCA on transposed matrix, scale=F creates variance-covariance matrix since all data is same units 
met.perc.pca <- prcomp(macau.sign.perc.meth.10x.t, scale=F) 

Inspect PCA axes, etc.

summary(met.perc.pca)
## Importance of components:
##                            PC1     PC2     PC3     PC4      PC5      PC6
## Standard deviation     43.8380 26.1830 23.0663 20.3097 17.53486 14.26078
## Proportion of Variance  0.4133  0.1474  0.1144  0.0887  0.06612  0.04373
## Cumulative Proportion   0.4133  0.5607  0.6751  0.7638  0.82994  0.87368
##                             PC7      PC8     PC9    PC10    PC11    PC12
## Standard deviation     12.26335 10.31811 9.55722 8.62860 7.87000 6.31613
## Proportion of Variance  0.03234  0.02289 0.01964 0.01601 0.01332 0.00858
## Cumulative Proportion   0.90602  0.92891 0.94855 0.96457 0.97788 0.98646
##                           PC13    PC14    PC15    PC16    PC17      PC18
## Standard deviation     5.15991 4.21017 2.95146 2.86058 1.30403 1.305e-14
## Proportion of Variance 0.00573 0.00381 0.00187 0.00176 0.00037 0.000e+00
## Cumulative Proportion  0.99219 0.99600 0.99787 0.99963 1.00000 1.000e+00
pca.eigenval(met.perc.pca) #The Proporation of Variance = %variance 
## Importance of components:
##                                 PC1         PC2         PC3         PC4
## Variance(eigenvalue)   1921.7681718 685.5474541 532.0556016 412.4831046
## Proportion of Variance    0.4132731   0.1474258   0.1144177   0.0887038
## Cumulative Proportion     0.4132731   0.5606989   0.6751166   0.7638204
## Broken-stick value        3.4951081   2.4951081   1.9951081   1.6617747
##                                PC5         PC6         PC7         PC8
## Variance(eigenvalue)   307.4713719 203.3699508 150.3898581 106.4633449
## Proportion of Variance   0.0661212   0.0437344   0.0323411   0.0228948
## Cumulative Proportion    0.8299416   0.8736760   0.9060171   0.9289119
## Broken-stick value       1.4117747   1.2117747   1.0451081   0.9022509
##                               PC9       PC10       PC11       PC12
## Variance(eigenvalue)   91.3404352 74.4527194 61.9368270 39.8934511
## Proportion of Variance  0.0196426  0.0160109  0.0133194  0.0085790
## Cumulative Proportion   0.9485545  0.9645654  0.9778848  0.9864638
## Broken-stick value      0.7772509  0.6661398  0.5661398  0.4752307
##                              PC13       PC14      PC15      PC16      PC17
## Variance(eigenvalue)   26.6246520 17.7255445 8.7111227 8.1829252 1.7004847
## Proportion of Variance  0.0057256  0.0038118 0.0018733 0.0017597 0.0003657
## Cumulative Proportion   0.9921894  0.9960013 0.9978746 0.9996343 1.0000000
## Broken-stick value      0.3918974  0.3149743 0.2435458 0.1768791 0.1143791
##                             PC18
## Variance(eigenvalue)   0.0000000
## Proportion of Variance 0.0000000
## Cumulative Proportion  1.0000000
## Broken-stick value     0.0555556
fviz_screeplot(met.perc.pca, addlabels = TRUE) #PC 1, 2 & 3 may be sign. 

ordi.monte(macau.sign.perc.meth.10x.t,ord="pca",dim=5) #only pc 1 sign. according to ordi.monte 

## Press return for next plot

## Press return for next plot

## Press return for next plot

## Press return for next plot

## Press return for next plot 
## Randomization Test of Eigenvalues:
##              PC1   PC2   PC3   PC4   PC5
## Eigenvalue 8.484 3.770 2.817 2.239 2.108
## P-value    0.000 0.206 0.941 0.999 0.941
fviz_contrib(met.perc.pca, choice = "var", axes = 1, top=50)

fviz_contrib(met.perc.pca, choice = "var", axes = 2, top=50)

fviz_contrib(met.perc.pca, choice = "var", axes = 3, top=50)

Create PCA cluster plots (not with variables)

pops <- c("HC", rep("SS", times=9), rep("HC", times=8)) #create pop vector for color-coding 

fviz_pca_biplot(met.perc.pca, label="ind", invisible = "var", col.ind=pops, axes = c(1,2), pointsize=3) + ggtitle(label="PCA % methylated, loci id'd by MACAU, PC 1x2")

fviz_pca_biplot(met.perc.pca, label="ind",  invisible = "var",col.ind=pops, axes = c(1,3), pointsize=3) + ggtitle(label="PCA % methylated, loci id'd by MACAU, PC 1x3")

fviz_pca_biplot(met.perc.pca, label="ind", invisible = "var", col.ind=pops, axes = c(2,3), pointsize=3) + ggtitle(label="PCA % methylated, loci id'd by MACAU, PC 2x3")  #dimension 2 = pop separation