Load libraries

list.of.packages <- c("tidyverse", "reshape2", "here", "Pstat") #add new libraries here 

new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
if(length(new.packages)) install.packages(new.packages)

# Load all libraries 
lapply(list.of.packages, FUN = function(X) {
  do.call("require", list(X)) 
})
#analyses/2bRAD/HCSS_Afilt32m70_01_pp90_m75_BSouts.vcf

Calculate methylation Pst for genome bins

1kb Genome Bins

Load filtered methylation data object with sample info which was created in the notebook “01-methylkit”

load(here::here("analyses", "methylation-filtered", "R-objects", "meth_filter_reshaped")) 
head(meth_filter_reshaped) #note, this object was used to create the "meth_filter_long.tab" file, so same content  
##       chr start   end strand sample coverage numCs numTs  percMeth population
## 1 Contig0 39226 39226      +      1       21    11    10  52.38095         HC
## 2 Contig0 39234 39234      +      1       24    10    14  41.66667         HC
## 3 Contig0 64179 64179      +      1       10     9     1  90.00000         HC
## 4 Contig0 71523 71523      +      1       13    13     0 100.00000         HC
## 5 Contig0 71533 71533      +      1       17    17     0 100.00000         HC
## 6 Contig0 71542 71542      +      1       16    15     1  93.75000         HC

Note: the file Katherine sent didn’t have columns separated by tabs, but by single-spaces. I tried without succes to convert spaces to tabs - the closest I got was the below script, but sed wouldn’t recognize \t as a tab. So, I just opened the file in the program Text Edit, did find/replace, and saved as a new file to the genome-features folder in the repo.

sed -e 's/ /\t/g' "../analyses/2bRAD/HCSS_sfsm70_10kb.bed" > "../genome-features/Olurida_v081_bins-10kb.bed"

Use intersectBed to find where methylatd loci and 1kb genome bins intersect

wb: Print all lines in the second file a: input data file, which was created in previous code chunk b: save annotated gene list

intersectBed \
  -wb \
  -a "../analyses/methylation-filtered/meth_filter_long.tab" \
  -b "../genome-features/Olurida_v081_bins-1kb.bed" \
  > "../analyses/methylation-filtered/meth_filter_1kb-bin.tab"

Count various things

wc -l "../analyses/methylation-filtered/meth_filter_long.tab"
wc -l "../analyses/methylation-filtered/meth_filter_1kb-bin.tab" 
wc -l "../genome-features/Olurida_v081_bins-1kb.bed"
##   607284 ../analyses/methylation-filtered/meth_filter_long.tab
##     4104 ../analyses/methylation-filtered/meth_filter_1kb-bin.tab
##     3883 ../genome-features/Olurida_v081_bins-1kb.bed

Check out format of resulting gene regions

head -n 5  "../analyses/methylation-filtered/meth_filter_1kb-bin.tab" 
## Contig10074  5759    5759    +   1   14  14  0   100 HC  Contig10074 5000    6000
## Contig10074  5773    5773    +   1   12  12  0   100 HC  Contig10074 5000    6000
## Contig100771 1673    1673    +   1   14  13  1   92.85714285714286   HC  Contig100771    1000    1852
## Contig126977 2743    2743    +   1   11  11  0   100 HC  Contig126977    2000    3000
## Contig126977 2842    2842    +   1   NA  NA  NA  NA  HC  Contig126977    2000    3000
meth_filter_bins_1kb <- read_delim(file = "../analyses/methylation-filtered/meth_filter_1kb-bin.tab", delim = "\t", col_names = c(colnames(meth_filter_reshaped), "contig", "bin.start", "bin.end")) %>% 
  mutate(bin=paste(contig, bin.start, bin.end, sep="_"))
## Parsed with column specification:
## cols(
##   chr = col_character(),
##   start = col_double(),
##   end = col_double(),
##   strand = col_character(),
##   sample = col_double(),
##   coverage = col_double(),
##   numCs = col_double(),
##   numTs = col_double(),
##   percMeth = col_double(),
##   population = col_character(),
##   contig = col_character(),
##   bin.start = col_double(),
##   bin.end = col_double()
## )

Calculate Pst for all bins represented in our methylation data

First, how many 1kb bins do we have methylation data for?

meth_filter_bins_1kb %>% 
       distinct(bin) %>%
  nrow()
## [1] 71

I need the % methylation data for each of the gene regions, so I’ll first create that dataframe.

perc_meth_bins_1kb <- meth_filter_bins_1kb %>% 
   group_by(population, sample, bin, contig, bin.start, bin.end) %>%
   dplyr::summarise(
    mean_percMeth = mean(percMeth, na.rm=TRUE),
    sd_percMeth=sd(percMeth, na.rm=TRUE),
    n())  

# check a couple loci to make sure % methylation is calculated separately for each sample and gene region 
perc_meth_bins_1kb %>% filter(bin=="Contig10074_5000_6000") %>%
  ggplot(aes(x=sample, y=mean_percMeth)) + geom_bar(stat="identity")
## Warning: Removed 1 rows containing missing values (position_stack).

perc_meth_bins_1kb %>% filter(bin=="Contig45217_3000_4000") %>%
  ggplot(aes(x=sample, y=mean_percMeth)) + geom_bar(stat="identity")
## Warning: Removed 2 rows containing missing values (position_stack).

# Reshape data. Need to have one row per sample, one column with the population, and separate columns with each gene region with % methylation. 
perc_meth_bins_1kb_wide <- perc_meth_bins_1kb %>% 
  ungroup() %>%
  #tidyr::unite("gene_region", c("contig_gene", "start_gene_2kb", "end_gene_2kb"), sep = "_", remove = FALSE) %>%
  dplyr::select(population, sample, bin, mean_percMeth) %>%
  spread(bin, mean_percMeth) %>%
  tibble::column_to_rownames(var = "sample")

head(perc_meth_bins_1kb_wide[1:4]) #confirm correct format 
##   population Contig10074_5000_6000 Contig100771_1000_1852
## 1         HC             100.00000               92.85714
## 2         HC              96.66667              100.00000
## 3         HC                   NaN               80.00000
## 4         HC             100.00000               83.33333
## 5         HC             100.00000              100.00000
## 6         HC              91.98718                    NaN
##   Contig126977_2000_3000
## 1              100.00000
## 2               96.15385
## 3               86.36364
## 4               87.50000
## 5               95.45455
## 6               90.00000
ncol(perc_meth_bins_1kb_wide)-1 # 71 bins
## [1] 71
perc_meth_bins_1kb$`n()` %>% hist()

perc_meth_bins_1kb$`n()` %>% summary()
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   1.000   2.000   3.211   4.000  14.000

Run Pst

#Now run the following line and it will provide Pst estimates for every gene.
perc_meth_bins_1kb_Pst <- Pst(perc_meth_bins_1kb_wide, ci=1) #ci=1 to include 95% confidence interval calc 
## [1] "Populations sizes are:"
## HC SS 
##  9  9
# Check out Pst distribution 
hist(perc_meth_bins_1kb_Pst$Pst_Values)

summary(perc_meth_bins_1kb_Pst$Pst_Values)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## 0.001323 0.081716 0.249733 0.278556 0.428745 0.809516
nrow(perc_meth_bins_1kb_Pst)
## [1] 71
# format of dataframe that I will save 
perc_meth_bins_1kb_Pst %>% separate(Quant_Varia, into=c("contig","start.bin", "end.bin"), sep = "_", remove = FALSE) %>% noquote() %>% View()

# Write out Pst results 
write.table(perc_meth_bins_1kb_Pst %>% 
              separate(Quant_Varia, into=c("contig","start.bin", "end.bin"), sep = "_", remove = FALSE), 
            file = here::here("analyses", "methylation-filtered", "Pst_bins_1kb.tab"), 
            sep = '\t', na = "NA", row.names = FALSE, col.names = TRUE, quote=FALSE)

10kb bins

intersectBed \
  -wb \
  -a "../analyses/methylation-filtered/meth_filter_long.tab" \
  -b "../genome-features/Olurida_v081_bins-10kb.bed" \
  > "../analyses/methylation-filtered/meth_filter_10kb-bin.tab"
meth_filter_bins_10kb <- read_delim(file = "../analyses/methylation-filtered/meth_filter_10kb-bin.tab", delim = "\t", col_names = c(colnames(meth_filter_reshaped), "contig", "bin.start", "bin.end")) %>% 
  mutate(bin=paste(contig, bin.start, bin.end, sep="_"))
## Parsed with column specification:
## cols(
##   chr = col_character(),
##   start = col_double(),
##   end = col_double(),
##   strand = col_character(),
##   sample = col_double(),
##   coverage = col_double(),
##   numCs = col_double(),
##   numTs = col_double(),
##   percMeth = col_double(),
##   population = col_character(),
##   contig = col_character(),
##   bin.start = col_double(),
##   bin.end = col_double()
## )
meth_filter_bins_10kb %>% 
       distinct(bin) %>%
  nrow() #271 10kb bins 
## [1] 271
perc_meth_bins_10kb <- meth_filter_bins_10kb %>% 
   group_by(population, sample, bin, contig, bin.start, bin.end) %>%
   dplyr::summarise(
    mean_percMeth = mean(percMeth, na.rm=TRUE),
    sd_percMeth=sd(percMeth, na.rm=TRUE),
    n())  

# check a couple loci to make sure % methylation is calculated separately for each sample and gene region 
perc_meth_bins_10kb %>% filter(bin=="Contig10074_0_6621") %>%
  ggplot(aes(x=sample, y=mean_percMeth)) + geom_bar(stat="identity")
## Warning: Removed 1 rows containing missing values (position_stack).

perc_meth_bins_10kb %>% filter(bin=="Contig19869_0_10000") %>%
  ggplot(aes(x=sample, y=mean_percMeth)) + geom_bar(stat="identity")

# Reshape data. Need to have one row per sample, one column with the population, and separate columns with each gene region with % methylation. 
perc_meth_bins_10kb_wide <- perc_meth_bins_10kb %>% 
  ungroup() %>%
  #tidyr::unite("gene_region", c("contig_gene", "start_gene_2kb", "end_gene_2kb"), sep = "_", remove = FALSE) %>%
  dplyr::select(population, sample, bin, mean_percMeth) %>%
  spread(bin, mean_percMeth) %>%
  tibble::column_to_rownames(var = "sample")

#Now run the following line and it will provide Pst estimates for every gene.
perc_meth_bins_10kb_Pst <- Pst(perc_meth_bins_10kb_wide, ci=1) #ci=1 to include 95% confidence interval calc 
## [1] "Populations sizes are:"
## HC SS 
##  9  9
#Pst(perc_meth_bins_10kb_wide, ci=1)

# Check out Pst distribution 
hist(perc_meth_bins_10kb_Pst$Pst_Values)

summary(perc_meth_bins_10kb_Pst$Pst_Values)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## 0.0000065 0.0757692 0.2489184 0.2721605 0.4132089 1.0000000
# Write out Pst results 
write.table(
  perc_meth_bins_10kb_Pst %>% 
    separate(Quant_Varia, into=c("contig","start.bin", "end.bin"), sep = "_", remove = FALSE), 
  file = here::here("analyses", "methylation-filtered", "Pst_bins_10kb.tab"), 
  sep = '\t', na = "NA", row.names = FALSE, col.names = TRUE, quote=FALSE)