list.of.packages <- c("methylKit", "tidyverse") #add new libraries here
new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
# methylKit is a Bioconductor package, so install any missing packages via BiocManager rather than install.packages()
if(length(new.packages)) {
  if(!requireNamespace("BiocManager", quietly = TRUE)) install.packages("BiocManager")
  BiocManager::install(new.packages)
}
# Load all libraries
lapply(list.of.packages, FUN = function(X) {
  do.call("require", list(X))
})
MACAU is a software program that assesses the influence of a predictor variable (and optional covariates) on differential methylation while controlling for relatedness. In our case, we will look at the influence of oyster length & wet weight on DML, while considering relatedness in the Hood Canal and South Sound Olympia oyster populations. Inputs required for MACAU include:

1. Total read count file (coverage at each locus, one column per sample)
2. Methylated read count file (numCs at each locus, one column per sample)
3. Predictor variable file (e.g. length or wet weight)
4. Relatedness matrix
5. Covariate file (optional; includes a column of 1's used as the intercept)

In this notebook I will generate #1 and #2 - the count files - from a methylKit object.
First, load the object that Steven created in 01-methylkit.Rmd.
load("../analyses/methylation-filtered/R-objects/meth_filter")
Create a subsettable dataframe from the methylKit object.
meth_filter.df <- getData(meth_filter)
Create a new, single column with a base site ID by merging the contig and start position info.
meth_filter.df$siteID <- with(meth_filter.df, paste(chr, start, sep="_")) #merge id columns into 1
ncol(meth_filter.df) #count total # columns
## [1] 59
meth_filter.df <- subset(meth_filter.df, select=c(59,1:58)) #move siteID to first column
rownames(meth_filter.df) <- NULL #remove row numbers
head(meth_filter.df) #check resulting dataframe
## siteID chr start end strand coverage1 numCs1 numTs1 coverage2
## 1 Contig0_39226 Contig0 39226 39226 + 21 11 10 22
## 2 Contig0_39234 Contig0 39234 39234 + 24 10 14 27
## 3 Contig0_64179 Contig0 64179 64179 + 10 9 1 14
## 4 Contig0_71523 Contig0 71523 71523 + 13 13 0 12
## 5 Contig0_71533 Contig0 71533 71533 + 17 17 0 17
## 6 Contig0_71542 Contig0 71542 71542 + 16 15 1 21
## numCs2 numTs2 coverage3 numCs3 numTs3 coverage4 numCs4 numTs4 coverage5
## 1 17 5 22 10 12 23 14 9 23
## 2 7 20 27 10 17 26 5 21 28
## 3 10 4 NA NA NA 18 17 1 11
## 4 12 0 11 11 0 18 16 2 10
## 5 17 0 14 13 1 17 17 0 17
## 6 21 0 12 12 0 14 14 0 13
## numCs5 numTs5 coverage6 numCs6 numTs6 coverage7 numCs7 numTs7 coverage8
## 1 9 14 21 13 8 20 9 11 16
## 2 12 16 25 11 14 23 7 16 24
## 3 8 3 13 13 0 13 5 8 12
## 4 8 2 NA NA NA 11 7 4 10
## 5 17 0 13 13 0 17 13 4 18
## 6 13 0 12 12 0 20 20 0 12
## numCs8 numTs8 coverage9 numCs9 numTs9 coverage10 numCs10 numTs10 coverage11
## 1 10 6 31 18 13 17 9 8 NA
## 2 8 16 36 8 28 19 7 12 NA
## 3 11 1 13 8 5 NA NA NA NA
## 4 10 0 16 15 1 23 19 4 18
## 5 18 0 19 19 0 22 22 0 22
## 6 11 1 13 13 0 16 16 0 19
## numCs11 numTs11 coverage12 numCs12 numTs12 coverage13 numCs13 numTs13
## 1 NA NA 23 15 8 17 10 7
## 2 NA NA 29 10 19 19 8 11
## 3 NA NA 19 14 5 11 8 3
## 4 17 1 11 11 0 26 22 4
## 5 22 0 17 15 2 30 29 1
## 6 19 0 12 12 0 33 33 0
## coverage14 numCs14 numTs14 coverage15 numCs15 numTs15 coverage16 numCs16
## 1 10 5 5 20 8 12 13 8
## 2 12 5 7 25 12 13 17 5
## 3 21 16 5 16 14 2 10 10
## 4 21 18 3 31 27 4 21 18
## 5 21 20 1 34 32 2 21 20
## 6 22 21 1 31 30 1 21 19
## numTs16 coverage17 numCs17 numTs17 coverage18 numCs18 numTs18
## 1 5 19 8 11 24 8 16
## 2 12 23 12 11 32 10 22
## 3 0 10 10 0 15 7 8
## 4 3 17 14 3 18 18 0
## 5 1 17 17 0 23 22 1
## 6 2 12 11 1 22 21 1
Create a dataframe with total counts (aka coverage) and location info (chromosome, start, strand). The end position was not pulled, since counts are of single base pairs (start and end are the same).
IMPORTANT NOTE: MACAU does not work if there are any missing (NA) or 0 values in the total count file. Because I still want to analyze the loci where one sample per population was filtered out due to <10x or >100x coverage, I need to do some merging.
# Create a low threshold object, needed to merge with the filtered dataframe so I can replace NA values with actual values
load("../analyses/myobj_18") #load original count object if needed
meth_united_alldf <- methylKit::unite(myobj_18, destrand=TRUE, min.per.group=1L) %>% getData()
meth_united_alldf$siteID <- with(meth_united_alldf, paste(chr, start, sep="_")) #merge id columns into 1
meth_united_alldf <- subset(meth_united_alldf, select=c(59,1:58)) #move siteID to first column
save(meth_united_alldf, file="../analyses/meth_united_alldf")
# Select loci from the full dataset that are only contained within the filtered dataset
meth_filter.df.noNA <- meth_united_alldf %>%
filter(siteID %in% meth_filter.df$siteID)
# See if there are any NA values left in the total count file
sapply(meth_filter.df.noNA[, grep(c("coverage"), colnames(meth_filter.df.noNA))], function(x) sum(is.na(x))) #yes, there are some
## coverage1 coverage2 coverage3 coverage4 coverage5 coverage6 coverage7
## 18 19 27 30 17 43 7
## coverage8 coverage9 coverage10 coverage11 coverage12 coverage13 coverage14
## 18 57 20 117 22 7 4
## coverage15 coverage16 coverage17 coverage18
## 54 20 52 35
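To pick out the sample with the most missing values without scanning the table by eye, a minimal sketch reusing the same grep pattern (per the counts above, coverage11 has the most NAs):

# Tally NAs per coverage column and report the column with the most missing values
na.counts <- sapply(meth_filter.df.noNA[, grep("coverage", colnames(meth_filter.df.noNA))], function(x) sum(is.na(x)))
names(which.max(na.counts)) #"coverage11", given the counts printed above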
head(meth_filter.df.noNA[is.na(meth_filter.df.noNA$coverage15),]) #inspect some rows where one sample (here coverage15) has missing coverage
## siteID chr start end strand coverage1 numCs1 numTs1
## 2760 Contig131341_3762 Contig131341 3762 3762 + 20 15 5
## 2761 Contig131341_3765 Contig131341 3765 3765 + 20 19 1
## 2762 Contig131341_3778 Contig131341 3778 3778 + 16 16 0
## 3829 Contig1511_3385 Contig1511 3385 3385 + 33 33 0
## 3830 Contig1511_3396 Contig1511 3396 3396 + 28 28 0
## 4080 Contig15660_93 Contig15660 93 93 + 22 22 0
## coverage2 numCs2 numTs2 coverage3 numCs3 numTs3 coverage4 numCs4 numTs4
## 2760 22 22 0 19 16 3 33 31 2
## 2761 20 18 2 22 15 7 32 25 7
## 2762 15 12 3 17 16 1 23 17 6
## 3829 15 13 2 NA NA NA 12 12 0
## 3830 11 9 2 NA NA NA 9 9 0
## 4080 13 12 1 18 18 0 33 32 1
## coverage5 numCs5 numTs5 coverage6 numCs6 numTs6 coverage7 numCs7 numTs7
## 2760 18 15 3 14 11 3 33 31 2
## 2761 18 10 8 16 5 11 31 17 14
## 2762 17 16 1 13 12 1 27 24 3
## 3829 20 20 0 23 22 1 22 22 0
## 3830 16 16 0 22 21 1 12 12 0
## 4080 19 18 1 22 21 1 31 30 1
## coverage8 numCs8 numTs8 coverage9 numCs9 numTs9 coverage10 numCs10 numTs10
## 2760 32 31 1 21 18 3 34 30 4
## 2761 35 26 9 21 18 3 36 25 11
## 2762 27 25 2 19 18 1 28 26 2
## 3829 24 23 1 20 19 1 17 16 1
## 3830 19 19 0 14 14 0 14 14 0
## 4080 18 18 0 41 40 1 44 42 2
## coverage11 numCs11 numTs11 coverage12 numCs12 numTs12 coverage13 numCs13
## 2760 25 19 6 36 34 2 30 28
## 2761 26 21 5 35 24 11 29 23
## 2762 18 18 0 29 21 8 19 17
## 3829 13 12 1 31 26 5 20 13
## 3830 11 11 0 24 22 2 18 17
## 4080 16 16 0 28 13 15 24 24
## numTs13 coverage14 numCs14 numTs14 coverage15 numCs15 numTs15 coverage16
## 2760 2 32 26 6 NA NA NA 35
## 2761 6 37 31 6 NA NA NA 28
## 2762 2 26 23 3 NA NA NA 27
## 3829 7 21 19 2 NA NA NA 39
## 3830 1 13 12 1 NA NA NA 28
## 4080 0 18 17 1 NA NA NA 14
## numCs16 numTs16 coverage17 numCs17 numTs17 coverage18 numCs18 numTs18
## 2760 30 5 28 26 2 35 35 0
## 2761 20 8 28 20 8 34 24 10
## 2762 26 1 21 21 0 28 25 3
## 3829 38 1 16 15 1 21 21 0
## 3830 28 0 12 12 0 21 20 1
## 4080 13 1 28 28 0 NA NA NA
# Extract only loci without NA values
meth_filter.df.noNA.final <- meth_filter.df.noNA[complete.cases(meth_filter.df.noNA[, grep(c("coverage"), colnames(meth_filter.df.noNA))]), ]
# Check for NAs again - there should be none
sapply(meth_filter.df.noNA.final[, grep(c("coverage"), colnames(meth_filter.df.noNA.final))], function(x) sum(is.na(x))) #nope!
## coverage1 coverage2 coverage3 coverage4 coverage5 coverage6 coverage7
## 0 0 0 0 0 0 0
## coverage8 coverage9 coverage10 coverage11 coverage12 coverage13 coverage14
## 0 0 0 0 0 0 0
## coverage15 coverage16 coverage17 coverage18
## 0 0 0 0
# See how many loci were dropped
nrow(meth_filter.df.noNA) - nrow(meth_filter.df.noNA.final) # 454 loci dropped
## [1] 454
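The note above also flags 0 values as a problem for MACAU. Since the retained counts come from the low-threshold object rather than the filtered one, a quick check that no total counts of 0 remain seems worthwhile - a minimal sketch using the same grep pattern:

# Count zero values in each total-count (coverage) column; MACAU cannot handle 0 total counts
sapply(meth_filter.df.noNA.final[, grep("coverage", colnames(meth_filter.df.noNA.final))], function(x) sum(x == 0))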
Create separate objects for total counts (coverage) and for methylated counts (numCs)
head(counts.tot <- meth_filter.df.noNA.final[, grep(c("siteID|coverage"), colnames(meth_filter.df.noNA.final))])
## siteID coverage1 coverage2 coverage3 coverage4 coverage5 coverage6
## 1 Contig0_39226 21 22 22 23 23 21
## 2 Contig0_39234 24 27 27 26 28 25
## 3 Contig0_64179 10 14 9 21 11 13
## 4 Contig0_71523 16 12 13 18 13 8
## 5 Contig0_71533 21 17 18 17 21 13
## 6 Contig0_71542 23 23 16 14 15 12
## coverage7 coverage8 coverage9 coverage10 coverage11 coverage12 coverage13
## 1 20 16 31 17 5 23 17
## 2 23 24 36 19 7 29 19
## 3 13 12 15 5 5 19 11
## 4 16 10 16 23 18 14 28
## 5 23 20 21 27 22 21 33
## 6 28 14 18 21 19 12 35
## coverage14 coverage15 coverage16 coverage17 coverage18
## 1 10 20 13 19 24
## 2 12 25 17 23 32
## 3 24 19 10 10 15
## 4 21 33 21 20 18
## 5 24 36 23 23 23
## 6 26 33 23 17 22
head(counts.meth <- meth_filter.df.noNA.final[, grep(c("siteID|numCs"), colnames(meth_filter.df.noNA.final))])
## siteID numCs1 numCs2 numCs3 numCs4 numCs5 numCs6 numCs7 numCs8 numCs9
## 1 Contig0_39226 11 17 10 14 9 13 9 10 18
## 2 Contig0_39234 10 7 10 5 12 11 7 8 8
## 3 Contig0_64179 9 10 8 20 8 13 5 11 9
## 4 Contig0_71523 16 12 13 16 11 8 11 10 15
## 5 Contig0_71533 21 17 17 17 21 13 17 20 20
## 6 Contig0_71542 21 22 16 14 15 12 28 13 18
## numCs10 numCs11 numCs12 numCs13 numCs14 numCs15 numCs16 numCs17 numCs18
## 1 9 4 15 10 5 8 8 8 8
## 2 7 1 10 8 5 12 5 12 10
## 3 3 4 14 8 18 17 10 10 7
## 4 19 17 13 24 18 28 18 17 18
## 5 27 22 17 32 23 33 22 23 22
## 6 21 19 12 35 25 32 21 16 21
Confirm that the rows (loci) are in the same order in the total count and methylated count objects.
table(counts.tot$siteID == counts.meth$siteID)
##
## TRUE
## 33284
nrow(counts.tot)==nrow(counts.meth)
## [1] TRUE
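Since MACAU pairs the two count files sample-by-sample, it may also be worth confirming that the columns line up - a minimal sketch, assuming the columns keep the coverageN/numCsN naming:

# Check that sample columns pair up: coverage1 with numCs1, coverage2 with numCs2, etc.
all(gsub("coverage", "", colnames(counts.tot)[-1]) == gsub("numCs", "", colnames(counts.meth)[-1]))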
Write dataframes to tab delimited .txt files for MACAU.
write.table(counts.tot, file = "../analyses/macau/counts-filtered-total.txt", sep = "\t", row.names = FALSE, quote = FALSE)
write.table(counts.meth, file = "../analyses/macau/counts-filtered-meth.txt", sep = "\t", row.names = FALSE, quote = FALSE)
save(counts.tot, file="../analyses/macau/R-objects/counts.tot")
Write out the full dataframe (merged siteID plus contig, start, strand, and count columns) for later use.
write.table(meth_filter.df.noNA.final, file = "../analyses/macau/meth_filter.df.txt", sep = "\t", row.names = FALSE, quote = FALSE)
I need the predictor variable and relatedness matrix to be in the same order as the count files.
Read in size data (predictor variable), and sample ID key.
print(size <- read.csv(file = "../data/mbd_size.csv", header=T, sep = "\t"))
## Sample Wet.weight Length
## 1 hc1_2 2.2 17.41
## 2 hc1_4 1.9 20.43
## 3 hc2_15 2.2 25.33
## 4 hc2_17 1.1 19.38
## 5 hc3_1 2.2 26.79
## 6 hc3_5 1.9 19.50
## 7 hc3_7 1.4 18.43
## 8 hc3_10 1.2 19.80
## 9 hc3_11 2.1 20.54
## 10 ss2_9 1.5 17.20
## 11 ss2_14 2.2 21.02
## 12 ss2_18 5.3 35.76
## 13 ss3_3 2.1 22.71
## 14 ss3_14 2.4 24.71
## 15 ss3_15 2.6 26.78
## 16 ss3_16 4.2 39.57
## 17 ss3_20 2.3 26.96
## 18 ss5_18 5.4 27.34
print(key <- read.csv("../data/sample-key.csv"))
## MBD.FILENAME SAMPLE
## 1 10 ss2_9
## 2 11 ss2_14
## 3 12 ss2_18
## 4 13 ss3_3
## 5 14 ss3_14
## 6 15 ss3_15
## 7 16 ss3_16
## 8 17 ss3_20
## 9 18 ss5_18
## 10 1 hc1_2
## 11 2 hc1_4
## 12 3 hc2_15
## 13 4 hc2_17
## 14 5 hc3_1
## 15 6 hc3_5
## 16 7 hc3_7
## 17 8 hc3_10
## 18 9 hc3_11
Merge the size data and sample key; the size data ultimately needs to be ordered by MBD sample file number (#1 to #18) to match the count files. Include a column of 1's - these are needed for the MACAU covariate file, where they serve as the intercept.
print(size.macau <- cbind(x=c(rep(1,times=18)), y=merge(y=size, x=key, by.y="Sample", by.x="SAMPLE")))
## x y.SAMPLE y.MBD.FILENAME y.Wet.weight y.Length
## 1 1 hc1_2 1 2.2 17.41
## 2 1 hc1_4 2 1.9 20.43
## 3 1 hc2_15 3 2.2 25.33
## 4 1 hc2_17 4 1.1 19.38
## 5 1 hc3_1 5 2.2 26.79
## 6 1 hc3_10 8 1.2 19.80
## 7 1 hc3_11 9 2.1 20.54
## 8 1 hc3_5 6 1.9 19.50
## 9 1 hc3_7 7 1.4 18.43
## 10 1 ss2_14 11 2.2 21.02
## 11 1 ss2_18 12 5.3 35.76
## 12 1 ss2_9 10 1.5 17.20
## 13 1 ss3_14 14 2.4 24.71
## 14 1 ss3_15 15 2.6 26.78
## 15 1 ss3_16 16 4.2 39.57
## 16 1 ss3_20 17 2.3 26.96
## 17 1 ss3_3 13 2.1 22.71
## 18 1 ss5_18 18 5.4 27.34
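Note that merge() orders rows alphabetically by the SAMPLE column, so y.MBD.FILENAME comes out as 1, 2, 3, 4, 5, 8, 9, 6, 7, ... above. If the coverage1-coverage18 columns in the count files follow MBD filename order, the rows may need re-sorting before writing the MACAU files - a minimal sketch, assuming y.MBD.FILENAME was read in as numeric:

# Re-sort rows by MBD filename so predictor/covariate rows match the coverage1..coverage18 column order in the count files
size.macau <- size.macau[order(size.macau$y.MBD.FILENAME), ]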
Save 3 text files for MACAU. Don’t include rownames or column names.
- predictors.size.macau.txt: contains both weight and length measurements so I can use either in MACAU as the predictor.
- cov.weight.macau.txt: contains a column of 1's (intercept) and weights to use as a covariate (when length is the predictor).
- cov.length.macau.txt: contains a column of 1's (intercept) and lengths to use as a covariate (when weight is the predictor).
write.table(size.macau[c(4:5)], file="../analyses/macau/predictors.size.macau.txt", sep="\t", row.names = FALSE, col.names = FALSE)
write.table(size.macau[c(1,4)], file="../analyses/macau/cov.weight.macau.txt", sep="\t", row.names = FALSE, col.names = FALSE)
write.table(size.macau[c(1,5)], file="../analyses/macau/cov.length.macau.txt", sep="\t", row.names = FALSE, col.names = FALSE)
I found out that I can use dummy/binary predictor variables and run MACAU to identify loci associated with population while controlling for relatedness. Here I prep a .txt file with numeric predictors: 1 (Hood Canal) and 2 (South Sound).
pop.macau <- size.macau[-4:-5] %>%
mutate(population=c(rep(1, times=9), rep(2, times=9)))
write.table(pop.macau[c(2:4)], file="../analyses/macau/predictors.pop.macau.txt", sep="\t", row.names = FALSE, col.names = FALSE)
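If MACAU expects the predictor file to contain only numeric values (one row per sample, mirroring predictors.size.macau.txt above), a minimal alternative sketch that writes just the population code would be:

# Hedged alternative: write only the numeric population code (1 = Hood Canal, 2 = South Sound), leaving out the sample ID and MBD filename columns
write.table(pop.macau["population"], file="../analyses/macau/predictors.pop.macau.txt", sep="\t", row.names = FALSE, col.names = FALSE)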