list.of.packages <- c("tidyverse", "reshape2", "here", "scales") #add new libraries here
new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
if(length(new.packages)) install.packages(new.packages)
# Load all libraries
lapply(list.of.packages, FUN = function(X) {
do.call("require", list(X))
})
sessionInfo()
DMLs between the Olympia oyster populations, Hood Canal and South Sound, were identified using MethylKit. File is: analyses/dml25.bed
MACAU was used to identify loci at which methylation is associated with a phenotype, in our case shell length, while controlling for relatedness. ### Locations of feature files
Olurida_v081-20190709.gene.gff - genes, gene = “../genome-features/Olurida_v081-20190709.gene.gff”
Olurida_v081-20190709.gene.2kbslop.gff - genes +/- 2kb, gene2kb = “../genome-features/Olurida_v081-20190709.gene.2kbslop.gff”
Olurida_v081-20190709.CDS.gff - Coding regions of genes, CDS = “../genome-features/Olurida_v081-20190709.CDS.gff”
Olurida_v081-20190709.exon.gff - Exons, exon = “../genome-features/Olurida_v081-20190709.exon.gff”
Olurida_v081-20190709.mRNA.gff - mRNA, mRNA = “../genome-features/Olurida_v081-20190709.mRNA.gff”
Olurida_v081_TE-Cg.gff - Transposable elements, TE = “../genome-features/Olurida_v081_TE-Cg.gff”
20190709-Olurida_v081.stringtie.gtf - alternative splice variants, ASV = “../genome-features/20190709-Olurida_v081.stringtie.gtf”
AllLociMACAU = “../analyses/macau/macau-all-loci.bed” AllLociDMLs = “../analyses/DMLs/mydiff-all.bed”
macau = “../analyses/macau/macau.sign.length.perc.meth.bed”
DML = “../analyses/DMLs/dml25.bed”
I will use bedtools
to identify where DML and MACAU loci intersect with known genome features.
bedtool intersect
options to use:
-u
- Write the original A entry once if any overlaps found in B, i.e. just report the fact >=1 hit was found
-a
- File A
-b
- File B
bedtools intersect -wb -a "../analyses/DMLs/dml25.bed" -b "../genome-features/Olurida_v081-20190709.gene.gff" > ../analyses/DMLs/DML-gene.bed
bedtools intersect -wb -a "../analyses/DMLs/dml25.bed" -b "../genome-features/Olurida_v081-20190709.gene.2kbslop.gff" > ../analyses/DMLs/DML-gene2kb.bed
bedtools intersect -wb -a "../analyses/DMLs/dml25.bed" -b "../genome-features/Olurida_v081-20190709.exon.gff" > ../analyses/DMLs/DML-exon.bed
bedtools intersect -wb -a "../analyses/DMLs/dml25.bed" -b "../genome-features/Olurida_v081-20190709.CDS.gff" > ../analyses/DMLs/DML-CDS.bed
bedtools intersect -wb -a "../analyses/DMLs/dml25.bed" -b "../genome-features/Olurida_v081-20190709.mRNA.gff" > ../analyses/DMLs/DML-mRNA.bed
bedtools intersect -wb -a "../analyses/DMLs/dml25.bed" -b "../genome-features/Olurida_v081_TE-Cg.gff" > ../analyses/DMLs/DML-TE.bed
bedtools intersect -wb -a "../analyses/DMLs/dml25.bed" -b "../genome-features/20190709-Olurida_v081.stringtie.gtf" > ../analyses/DMLs/DML-ASV.bed
bedtools intersect -v -a "../analyses/DMLs/dml25.bed" -b "../genome-features/Olurida_v081-20190709.gene.2kbslop.gff" "../genome-features/Olurida_v081-20190709.exon.gff" "../genome-features/Olurida_v081-20190709.CDS.gff" "../genome-features/Olurida_v081-20190709.mRNA.gff" "../genome-features/Olurida_v081_TE-Cg.gff" "../genome-features/20190709-Olurida_v081.stringtie.gtf" > ../analyses/DMLs/DML-unknown.bed
bedtools intersect -wb -a "../analyses/DMLs/mydiff-all.bed" -b "../genome-features/Olurida_v081-20190709.gene.gff" > ../analyses/DMLs/AllLociDMLs-gene.bed
bedtools intersect -wb -a "../analyses/DMLs/mydiff-all.bed" -b "../genome-features/Olurida_v081-20190709.gene.2kbslop.gff" > ../analyses/DMLs/AllLociDMLs-gene2kb.bed
bedtools intersect -wb -a "../analyses/DMLs/mydiff-all.bed" -b "../genome-features/Olurida_v081-20190709.exon.gff" > ../analyses/DMLs/AllLociDMLs-exon.bed
bedtools intersect -wb -a "../analyses/DMLs/mydiff-all.bed" -b "../genome-features/Olurida_v081-20190709.CDS.gff" > ../analyses/DMLs/AllLociDMLs-CDS.bed
bedtools intersect -wb -a "../analyses/DMLs/mydiff-all.bed" -b "../genome-features/Olurida_v081-20190709.mRNA.gff" > ../analyses/DMLs/AllLociDMLs-mRNA.bed
bedtools intersect -wb -a "../analyses/DMLs/mydiff-all.bed" -b "../genome-features/Olurida_v081_TE-Cg.gff" > ../analyses/DMLs/AllLociDMLs-TE.bed
bedtools intersect -wb -a "../analyses/DMLs/mydiff-all.bed" -b "../genome-features/20190709-Olurida_v081.stringtie.gtf" > ../analyses/DMLs/AllLociDMLs-ASV.bed
bedtools intersect -v -a "../analyses/DMLs/mydiff-all.bed" -b "../genome-features/Olurida_v081-20190709.gene.2kbslop.gff" "../genome-features/Olurida_v081-20190709.exon.gff" "../genome-features/Olurida_v081-20190709.CDS.gff" "../genome-features/Olurida_v081-20190709.mRNA.gff" "../genome-features/Olurida_v081_TE-Cg.gff" "../genome-features/20190709-Olurida_v081.stringtie.gtf" > ../analyses/DMLs/AllLociDMLs-unknown.bed
bedtools intersect -wb -a "../analyses/macau/macau.sign.length.perc.meth.bed" -b "../genome-features/Olurida_v081-20190709.gene.gff" > ../analyses/macau/macau-gene.bed
bedtools intersect -wb -a "../analyses/macau/macau.sign.length.perc.meth.bed" -b "../genome-features/Olurida_v081-20190709.gene.2kbslop.gff" > ../analyses/macau/macau-gene2kb.bed
bedtools intersect -wb -a "../analyses/macau/macau.sign.length.perc.meth.bed" -b "../genome-features/Olurida_v081-20190709.exon.gff" > ../analyses/macau/macau-exon.bed
bedtools intersect -wb -a "../analyses/macau/macau.sign.length.perc.meth.bed" -b "../genome-features/Olurida_v081-20190709.CDS.gff" > ../analyses/macau/macau-CDS.bed
bedtools intersect -wb -a "../analyses/macau/macau.sign.length.perc.meth.bed" -b "../genome-features/Olurida_v081-20190709.mRNA.gff" > ../analyses/macau/macau-mRNA.bed
bedtools intersect -wb -a "../analyses/macau/macau.sign.length.perc.meth.bed" -b "../genome-features/Olurida_v081_TE-Cg.gff" > ../analyses/macau/macau-TE.bed
bedtools intersect -wb -a "../analyses/macau/macau.sign.length.perc.meth.bed" -b "../genome-features/20190709-Olurida_v081.stringtie.gtf" > ../analyses/macau/macau-ASV.bed
bedtools intersect -v -a "../analyses/macau/macau.sign.length.perc.meth.bed" -b "../genome-features/Olurida_v081-20190709.gene.2kbslop.gff" "../genome-features/Olurida_v081-20190709.exon.gff" "../genome-features/Olurida_v081-20190709.CDS.gff" "../genome-features/Olurida_v081-20190709.mRNA.gff" "../genome-features/Olurida_v081_TE-Cg.gff" "../genome-features/20190709-Olurida_v081.stringtie.gtf" > ../analyses/macau/macau-unknown.bed
bedtools intersect -wb -a "../analyses/macau/macau-all-loci.bed" -b "../genome-features/Olurida_v081-20190709.gene.gff" > ../analyses/macau/AllLociMACAU-gene.bed
bedtools intersect -wb -a "../analyses/macau/macau-all-loci.bed" -b "../genome-features/Olurida_v081-20190709.gene.2kbslop.gff" > ../analyses/macau/AllLociMACAU-gene2kb.bed
bedtools intersect -wb -a "../analyses/macau/macau-all-loci.bed" -b "../genome-features/Olurida_v081-20190709.exon.gff" > ../analyses/macau/AllLociMACAU-exon.bed
bedtools intersect -wb -a "../analyses/macau/macau-all-loci.bed" -b "../genome-features/Olurida_v081-20190709.CDS.gff" > ../analyses/macau/AllLociMACAU-CDS.bed
bedtools intersect -wb -a "../analyses/macau/macau-all-loci.bed" -b "../genome-features/Olurida_v081-20190709.mRNA.gff" > ../analyses/macau/AllLociMACAU-mRNA.bed
bedtools intersect -wb -a "../analyses/macau/macau-all-loci.bed" -b "../genome-features/Olurida_v081_TE-Cg.gff" > ../analyses/macau/AllLociMACAU-TE.bed
bedtools intersect -wb -a "../analyses/macau/macau-all-loci.bed" -b "../genome-features/20190709-Olurida_v081.stringtie.gtf" > ../analyses/macau/AllLociMACAU-ASV.bed
bedtools intersect -v -a "../analyses/macau/macau-all-loci.bed" -b "../genome-features/Olurida_v081-20190709.gene.2kbslop.gff" "../genome-features/Olurida_v081-20190709.exon.gff" "../genome-features/Olurida_v081-20190709.CDS.gff" "../genome-features/Olurida_v081-20190709.mRNA.gff" "../genome-features/Olurida_v081_TE-Cg.gff" "../genome-features/20190709-Olurida_v081.stringtie.gtf" > ../analyses/macau/AllLociMACAU-unknown.bed
NOTE: remove “eval=FALSE” to execute this code chunk
#curl https://raw.githubusercontent.com/sr320/paper-oly-mbdbs-gen/master/analyses/Olgene_blastx_uniprot.05.tab > ../data/Olgene_blastx_uniprot.05.tab
tr '|' '\t' < ../data/Olgene_blastx_uniprot.05.tab \
> ../data/Olgene_blastx_uniprot.05-20191122.tab
wc -l ../data/Olgene_blastx_uniprot.05-20191122.tab
## 11803 ../data/Olgene_blastx_uniprot.05-20191122.tab
awk -v OFS='\t' '{print $1, $3, $13}' \
< ../data/Olgene_blastx_uniprot.05-20191122.tab | sort \
> ../data/Olgene_blastx_uniprot.05-20191122-sort.tab
wc -l ../data/Olgene_blastx_uniprot.05-20191122-sort.tab
## 11803 ../data/Olgene_blastx_uniprot.05-20191122-sort.tab
head ../data/Olgene_blastx_uniprot.05-20191122-sort.tab
## Contig100018:1232-2375 P31695 2.23e-06
## Contig100073:8284-10076 H2A0L8 6.98e-24
## Contig100101:2158-2821 O35796 3.67e-28
## Contig100107:1089-2009 Q2KMM2 8.78e-15
## Contig100114:437-2094 Q9V4M2 1.41e-09
## Contig100163:2402-6678 P23708 2.55e-18
## Contig100166:542-2054 G5EBR3 2.08e-11
## Contig100170:472-1350 Q5F3T9 9.14e-42
## Contig100188:460-2761 Q8TD26 1.35e-18
## Contig100206:5719-12338 Q2HJH1 1.51e-14
uniprot <- read_delim(here::here("data", "Olgene_blastx_uniprot.05-20191122-sort.tab"), delim="\t", col_names = FALSE) %>%
setNames(c("gene", "UniprotID", "unknown")) %>%
separate(gene, into = c("contig.feat", "start.feat", "end.feat"), sep = '[:-]') %>%
mutate_at(c("start.feat", "end.feat"),as.numeric)
## Parsed with column specification:
## cols(
## X1 = col_character(),
## X2 = col_character(),
## X3 = col_double()
## )
DMLfiles <- c("DML-CDS.bed", "DML-exon.bed", "DML-gene.bed", "DML-gene2kb.bed", "DML-mRNA.bed", "DML-TE.bed", "DML-ASV.bed", "DML-unknown.bed")
DML.features <- list()
for (i in c(1:8)) {
DML.features[[i]] <- read_delim(here::here("analyses", "DMLs", DMLfiles[i]), delim = '\t', col_names = FALSE) %>% as_tibble()}
for (i in 1:7) {
DML.features[[i]] <- DML.features[[i]] %>%
setNames(c("contig.dml","start.dml","end.dml","score.dml","contig.feat", "source.feat","feature","start.feat","end.feat","unknown1","strand","unknown2","attribute")) %>%
mutate(ID=str_extract(attribute, "ID=(.*?);"),
Parent=str_extract(attribute, "Parent=(.*?);"),
Name=str_extract(attribute, "Name=(.*?);"),
Alias=str_extract(attribute, "Alias=(.*?);"),
AED=str_extract(attribute, "AED=(.*?);"),
eAED=str_extract(attribute, "eAED=(.*?);"),
Note=str_extract(attribute, "Note=(.*?);"),
Ontology_term=str_extract(attribute, "Ontology_term=(.*?);"),
Dbxref=str_extract(attribute, "Dbxref=(.*?);")
) %>%
mutate_at("feature", as.factor)
}
names(DML.features) <- c("DML.CDS", "DML.exon", "DML.gene","DML.gene2kb", "DML.mRNA", "DML.TE", "DML.ASV", "DML.unknown")
DML.features[["DML.unknown"]] <-
DML.features[["DML.unknown"]] %>%
setNames(c("contig.DML", "start.dml", "end.dml", "score.dml"))
DML.features[["DML.CDS"]] <- DML.features[["DML.CDS"]] %>% mutate_at("unknown2", as.character)
DML.features[["DML.TE"]] <- DML.features[["DML.TE"]] %>% mutate_at("unknown1", as.character)
DML.features[["DML.ASV"]] <- DML.features[["DML.ASV"]] %>% mutate_at("unknown1", as.character)
DML.features[["DML.gene2kb"]]$feature <- "gene2kb"
DML.features[["DML.unknown"]]$feature <- "unknown"
DML.features.df <- bind_rows(DML.features[-7])
print(DML.summary <- table(DML.features.df[c("feature")])) #Note: "similarity" refers to transposable elements; also NO alternative splice variants are included.
##
## CDS exon gene gene2kb mRNA similarity unknown
## 185 190 219 255 219 9 25
#
DML.features[[3]] %>% left_join(uniprot, by=c("contig.feat","start.feat", "end.feat"))
## # A tibble: 219 x 24
## contig.dml start.dml end.dml score.dml contig.feat source.feat feature
## <chr> <dbl> <dbl> <dbl> <chr> <chr> <fct>
## 1 Contig100… 3427 3429 -31.8 Contig1009… maker gene
## 2 Contig101… 3024 3026 26.3 Contig1016… maker gene
## 3 Contig103… 7053 7055 -36.2 Contig1035… maker gene
## 4 Contig105… 1696 1698 -29.1 Contig1052… maker gene
## 5 Contig109… 3377 3379 54.2 Contig1095… maker gene
## 6 Contig114… 412 414 -25.1 Contig1144… maker gene
## 7 Contig118… 327 329 37.1 Contig1181… maker gene
## 8 Contig1245 9228 9230 -55.3 Contig1245 maker gene
## 9 Contig1270 4726 4728 38.7 Contig1270 maker gene
## 10 Contig128… 3369 3371 -27.6 Contig1280… maker gene
## # … with 209 more rows, and 17 more variables: start.feat <dbl>,
## # end.feat <dbl>, unknown1 <chr>, strand <chr>, unknown2 <chr>,
## # attribute <chr>, ID <chr>, Parent <chr>, Name <chr>, Alias <chr>,
## # AED <chr>, eAED <chr>, Note <chr>, Ontology_term <chr>, Dbxref <chr>,
## # UniprotID <chr>, unknown <dbl>
#save DML loci feature df object to file, to use in notebook #12
save(DML.features.df, file="../analyses/DMLs/R-objects/DML.features.df")
save(DML.summary, file="../analyses/DMLs/R-objects/DML.summary")
macau.files <- c("macau-CDS.bed", "macau-exon.bed", "macau-gene.bed", "macau-gene2kb.bed", "macau-mRNA.bed", "macau-ASV.bed", "macau-TE.bed", "macau-unknown.bed")
macau.features <- list()
for (i in c(1:8)) {
macau.features[[i]] <- read_delim(here::here("analyses", "macau", macau.files[i]), delim = '\t', col_names = FALSE) %>% as_tibble()}
for (i in 1:6) {
macau.features[[i]] <- macau.features[[i]] %>%
setNames(c("contig.macau","start.macau","end.macau","contig.feat", "source.feat","feature","start.feat","end.feat","unknown1","strand","unknown2","attribute")) %>%
mutate(ID=str_extract(attribute, "ID=(.*?);"),
Parent=str_extract(attribute, "Parent=(.*?);"),
Name=str_extract(attribute, "Name=(.*?);"),
Alias=str_extract(attribute, "Alias=(.*?);"),
AED=str_extract(attribute, "AED=(.*?);"),
eAED=str_extract(attribute, "eAED=(.*?);"),
Note=str_extract(attribute, "Note=(.*?);"),
Ontology_term=str_extract(attribute, "Ontology_term=(.*?);"),
Dbxref=str_extract(attribute, "Dbxref=(.*?);")
) %>%
mutate_at("feature", as.factor)
}
names(macau.features) <- c("macau.CDS", "macau.exon","macau.gene", "macau.gene2kb","macau.mRNA", "macau.ASV", "macau.TE", "macau.unknown")
macau.features[["macau.CDS"]] <- macau.features[["macau.CDS"]] %>% mutate_at("unknown2", as.character)
macau.features[["macau.ASV"]] <- macau.features[["macau.ASV"]] %>% mutate_at("unknown1", as.character)
macau.features[["macau.gene2kb"]]$feature <- "gene2kb"
#macau.features[["macau.unknown"]]$feature <- "unknown"
macau.features.df <- bind_rows(macau.features[-6:-7])
print(macau.summary <- table(macau.features.df[c("feature")])) #NOTE this does not contain alternative splice variants. There are zero transposable elements and unknown loci.
##
## CDS exon gene gene2kb mRNA
## 15 16 17 18 17
save(macau.summary, file="../analyses/macau/R-objects/macau.summary")
#macau.features[["macau.gene"]] %>% inner_join(uniprot, by=c("contig.feat","start.feat", "end.feat")) #not sure if this is working?
load(here::here("analyses", "methylation-filtered", "R-objects", "meth_filter_reshaped"))
meth_filter_calcs <- meth_filter_reshaped %>%
group_by(population, chr, start) %>%
dplyr::summarise(
mean_percMeth = mean(percMeth, na.rm=TRUE),
sd_percMeth=sd(percMeth, na.rm=TRUE),
n())
meth_filter_calcs %>%
filter(chr %in% macau.features[["macau.gene2kb"]]$contig.macau &
start %in% macau.features[["macau.gene2kb"]]$start.macau) %>%
ggplot(aes(x = population, y = mean_percMeth, fill = population,
label=paste0(round(mean_percMeth, digits = 2), "%"))) +
geom_bar(stat="identity", width = 0.5) + ylim(0,110) +
geom_pointrange(aes(ymin=mean_percMeth,
ymax=mean_percMeth+sd_percMeth, width=0.15)) +
geom_text(size=3, vjust=-0.5, hjust=1.25) +
theme_light() + ggtitle("") + facet_wrap(~chr) +
scale_fill_manual(values=c("firebrick3","dodgerblue3"))
## Warning: Ignoring unknown aesthetics: width
allLociDMLfiles <- c("AllLociDMLs-CDS.bed", "AllLociDMLs-exon.bed", "AllLociDMLs-gene.bed", "AllLociDMLs-gene2kb.bed", "AllLociDMLs-mRNA.bed", "AllLociDMLs-TE.bed", "AllLociDMLs-ASV.bed", "AllLociDMLs-unknown.bed")
allLociDML.features <- list()
for (i in c(1:8)) {
allLociDML.features[[i]] <- read_delim(here::here("analyses","DMLs", allLociDMLfiles[i]), delim = '\t', col_names = FALSE) %>% as_tibble()}
for (i in 1:7) {
allLociDML.features[[i]] <- allLociDML.features[[i]] %>%
setNames(c("contig.allLoci","start.allLoci","end.allLoci", "unknown", "contig.feat", "source.feat","feature","start.feat","end.feat","unknown1","strand","unknown2","attribute")) %>%
mutate(ID=str_extract(attribute, "ID=(.*?);"),
Parent=str_extract(attribute, "Parent=(.*?);"),
Name=str_extract(attribute, "Name=(.*?);"),
Alias=str_extract(attribute, "Alias=(.*?);"),
AED=str_extract(attribute, "AED=(.*?);"),
eAED=str_extract(attribute, "eAED=(.*?);"),
Note=str_extract(attribute, "Note=(.*?);"),
Ontology_term=str_extract(attribute, "Ontology_term=(.*?);"),
Dbxref=str_extract(attribute, "Dbxref=(.*?);")
) %>%
mutate_at("feature", as.factor)
}
#allLociDML.features[[8]] <- allLociDML.features[[8]] %>%
# setNames(c("contig.feat", "start.feat","end.feat","unknown"))
names(allLociDML.features) <- c("allLociDML.CDS","allLociDML.exon","allLociDML.gene","allLociDML.gene2kb","allLociDML.mRNA","allLociDML.TE","allLociDML.ASV","allLociDML.unknown")
allLociDML.features[["allLociDML.unknown"]] <-
allLociDML.features[["allLociDML.unknown"]] %>%
setNames(c("contig.allLoci", "start.allLoci", "end.allLoci", "unkown"))
allLociDML.features[["allLociDML.CDS"]] <- allLociDML.features[["allLociDML.CDS"]] %>% mutate_at("unknown2", as.character)
allLociDML.features[["allLociDML.ASV"]] <- allLociDML.features[["allLociDML.ASV"]] %>% mutate_at("unknown1", as.character)
allLociDML.features[["allLociDML.gene2kb"]]$feature <- "gene2kb"
allLociDML.features[["allLociDML.unknown"]]$feature <- "unknown"
allLociDML.features.df <- bind_rows(allLociDML.features[-7]) #don't include alternative splice variants or unknown
print(allLociDML.summary <- table(allLociDML.features.df[c("feature")])) #Note, "similarity"= transposable elements
##
## CDS exon gene gene2kb mRNA similarity unknown
## 15322 15943 18688 21073 18688 1588 4156
save(allLociDML.summary, file="../analyses/DMLs/R-objects/allLociDML.summary")
allLociMACAUfiles <- c("AllLociMACAU-CDS.bed", "AllLociMACAU-exon.bed", "AllLociMACAU-gene.bed", "AllLociMACAU-gene2kb.bed", "AllLociMACAU-mRNA.bed", "AllLociMACAU-TE.bed", "AllLociMACAU-ASV.bed", "AllLociMACAU-unknown.bed")
allLociMACAU.features <- list()
for (i in c(1:8)) {
allLociMACAU.features[[i]] <- read_delim(here::here("analyses","macau", allLociMACAUfiles[i]), delim = '\t', col_names = FALSE) %>% as_tibble()}
for (i in 1:7) {
allLociMACAU.features[[i]] <- allLociMACAU.features[[i]] %>%
setNames(c("contig.allLoci","start.allLoci","end.allLoci", "contig.feat", "source.feat","feature","start.feat","end.feat","unknown1","strand","unknown2","attribute")) %>%
mutate(ID=str_extract(attribute, "ID=(.*?);"),
Parent=str_extract(attribute, "Parent=(.*?);"),
Name=str_extract(attribute, "Name=(.*?);"),
Alias=str_extract(attribute, "Alias=(.*?);"),
AED=str_extract(attribute, "AED=(.*?);"),
eAED=str_extract(attribute, "eAED=(.*?);"),
Note=str_extract(attribute, "Note=(.*?);"),
Ontology_term=str_extract(attribute, "Ontology_term=(.*?);"),
Dbxref=str_extract(attribute, "Dbxref=(.*?);")
) %>%
mutate_at("feature", as.factor)
}
names(allLociMACAU.features) <- c("allLociMACAU.CDS","allLociMACAU.exon","allLociMACAU.gene","allLociMACAU.gene2kb","allLociMACAU.mRNA","allLociMACAU.TE","allLociMACAU.ASV","allLociMACAU.unknown")
allLociMACAU.features[["allLociMACAU.unknown"]] <-
allLociMACAU.features[["allLociMACAU.unknown"]] %>%
setNames(c("contig.allLoci", "start.allLoci", "end.allLoci"))
allLociMACAU.features[["allLociMACAU.CDS"]] <- allLociMACAU.features[["allLociMACAU.CDS"]] %>% mutate_at("unknown2", as.character)
allLociMACAU.features[["allLociMACAU.ASV"]] <- allLociMACAU.features[["allLociMACAU.ASV"]] %>% mutate_at("unknown1", as.character)
allLociMACAU.features[["allLociMACAU.gene2kb"]]$feature <- "gene2kb"
allLociMACAU.features[["allLociMACAU.unknown"]]$feature <- "unknown"
allLociMACAU.features.df <- bind_rows(allLociMACAU.features[-7]) #don't include alternative splice variants or unknown
print(allLociMACAU.summary <- table(allLociMACAU.features.df[c("feature")])) #Note, "similartiy"= transposable elements
##
## CDS exon gene gene2kb mRNA similarity unknown
## 15221 15837 18535 20875 18535 1517 4038
save(allLociMACAU.summary, file="../analyses/macau/R-objects/allLociMACAU.summary")
load("../analyses/DMLs/R-objects/dml25") # load df for no. of DMLs (including those not annotated to known features)
load("../analyses/DMLs/R-objects/mydiff.all") # load df for total no. of loci (which were analyzed for DML)
load("../analyses/macau/R-objects/macau.FDR.length") #load df for total no. of MACAU loci analyzed & no. of sign. loci
# Add row to summary dataframe with total number of loci in DMLs, all loci
# Create single summary df showing where DML loci and all loci analyzed for DML are located
loci.locations <- merge(x=merge(x=merge(x=melt(DML.summary, varnames = "feature", value.name = "DML"),
y=melt(macau.summary, varnames = "feature", value.name = "macau"),
by="feature", all=TRUE),
y=melt(allLociDML.summary, varnames = "feature", value.name = "allLociDML"),
by="feature", all=TRUE),
y=melt(allLociMACAU.summary, varnames = "feature", value.name = "allLociMACAU"),
by="feature", all=TRUE) %>%
mutate(feature = as.character(feature)) %>%
mutate(DML=as.numeric(DML), allLociDML=as.numeric(allLociDML))
# DML=as.numeric(DML), allLociDML=as.numeric(allLociDML)) %>%
loci.locations <- loci.locations %>%
rbind(c("geneflank2kb",
loci.locations[loci.locations$feature=="gene2kb","DML"]-loci.locations[loci.locations$feature=="gene","DML"],
loci.locations[loci.locations$feature=="gene2kb","macau"]-loci.locations[loci.locations$feature=="gene","macau"],
loci.locations[loci.locations$feature=="gene2kb","allLociDML"]-loci.locations[loci.locations$feature=="gene","allLociDML"],
loci.locations[loci.locations$feature=="gene2kb","allLociMACAU"]-loci.locations[loci.locations$feature=="gene","allLociMACAU"])) %>%
rbind(c("all", as.numeric(nrow(dml25)), as.numeric(nrow(subset(macau.FDR.length, significant=="TRUE"))), as.numeric(nrow(mydiff.all)), as.numeric(nrow(macau.FDR.length)))) %>%
mutate_at(vars(-feature), funs(as.numeric)) %>%
mutate(DMLperc=DML/nrow(dml25),
MACAUperc=macau/nrow(subset(macau.FDR.length, significant=="TRUE")),
AllDMLperc=allLociDML/nrow(mydiff.all),
AllMACAUperc=allLociMACAU/nrow(macau.FDR.length))
save(loci.locations, file="../analyses/methylation-filtered/R-objects/loci.locations")
loci.locations.long <- cbind(melt(loci.locations[,c("feature", "DML", "macau", "allLociDML", "allLociMACAU")],
variable.name = "analysis", value.name = "count"),
melt(loci.locations[,c("feature", "DMLperc", "MACAUperc", "AllDMLperc", "AllMACAUperc")],
variable.name = "analysis", value.name = "percent"))[,c(1,2,3,6)] %>%
mutate(analysis=fct_relevel(analysis, c("DML", "allLociDML", "macau", "allLociMACAU")),
feature=as.factor(str_replace(feature, "similarity", "TE")))
save(loci.locations.long, file="../analyses/methylation-filtered/R-objects/loci.locations.long")
ggplot(data=subset(loci.locations.long, feature=="gene" | feature=="geneflank2kb" | feature=="exon" | feature=="TE" | feature=="unknown"), aes(x=analysis, y=percent, fill=feature, label=percent(percent, accuracy = 0.1))) + #prettyNum(count, big.mark = ",")
# geom_bar(stat="identity", width = .5) +
geom_bar(position="fill", stat="identity", width=0.5) +
labs(y="% of Loci Assessed", x=NULL) +
scale_fill_manual(name = "Loci Location", labels = c("Exon", "Gene Body", "Gene Flanking Regions (+/-2kb)", "Transposable Elements", "Unknown Regions"),
values=c("#a6cee3", "#1f78b4", "#b2df8a","#33a02c", "#fb9a99")) +
ggtitle("Locations of DML and size-associated (MACAU) loci in genome\n with background loci locations") +
theme_minimal() + geom_text(size = 3, position = position_fill(vjust = 0.5)) +
scale_x_discrete(labels=c("DML" = "DMLs",
"allLociDML" = "DML Background\nLoci",
"macau" = "Size-Associated\nLoci (MACAU)",
"allLociMACAU" = "MACAU\nBackground\nLoci"))
## Warning: Removed 2 rows containing missing values (position_stack).
## Warning: Removed 2 rows containing missing values (position_stack).
###### ====== DML loci GO terms
# Count string lengths in the Ontology_term column and check out - to figure out max # GO terms
#data.frame(ontology=DML.features.df$Ontology_term,chr=apply(DML.features.df[,"Ontology_term"],2,nchar)) # %>% View()
DML.GO <- DML.features.df[c("contig.dml", "start.dml", "end.dml", "feature", "start.feat", "end.feat", "Note", "Ontology_term")] %>%
distinct(contig.dml, start.dml, Note, Ontology_term, .keep_all = TRUE) %>%
mutate(Ontology_term = str_replace(Ontology_term, pattern="Ontology_term=",replacement = "")) %>%
mutate(Ontology_term = str_replace(Ontology_term, pattern=";",replacement = "")) %>%
separate(Ontology_term, sep=",", into=paste("GO", 1:11, sep="_")) %>%
pivot_longer(cols=c("GO_1","GO_2","GO_3","GO_4","GO_5","GO_6","GO_7","GO_8","GO_9","GO_10","GO_11"), names_to = "GO_number", values_to = "GO_term") %>%
select(-GO_number) %>%
filter(!is.na(Note) & !is.na(GO_term))
## Warning: Expected 11 pieces. Missing pieces filled with `NA` in 108 rows [190,
## 192, 198, 202, 205, 206, 212, 213, 214, 217, 218, 224, 231, 232, 233, 235, 236,
## 237, 238, 239, ...].
write_csv(DML.features.df, path = here::here("analyses/", "DMLs", "DML.features.txt")) #write out df with all DML features
write(DML.GO$GO_term, file = here::here("analyses/", "DMLs", "DML.GO.txt"))
write.table(subset(DML.features.df, feature=="gene"), here::here("analyses/", "DMLs", "DML.geneinfo.txt"), sep = "\t") #write out gene info for DMLs to associate with UniprotIDs
###### ====== MACAU loci GO terms
# Count string lengths in the Ontology_term column and check out - to figure out max # GO terms
#data.frame(ontology=macau.features.df$Ontology_term,chr=apply(macau.features.df[,"Ontology_term"],2,nchar)) #%>% View()
macau.GO <- macau.features.df[c("contig.macau", "start.macau", "end.macau", "feature", "start.feat", "end.feat", "Note", "Ontology_term")] %>%
distinct(contig.macau, start.macau, Note, Ontology_term, .keep_all = TRUE) %>%
mutate(Ontology_term = str_replace(Ontology_term, pattern="Ontology_term=",replacement = "")) %>%
mutate(Ontology_term = str_replace(Ontology_term, pattern=";",replacement = "")) %>%
separate(Ontology_term, sep=",", into=paste("GO", 1:6, sep="_")) %>%
pivot_longer(cols=c("GO_1","GO_2","GO_3","GO_4","GO_5","GO_6"), names_to = "GO_number", values_to = "GO_term") %>%
select(-GO_number) %>%
filter(!is.na(Note) & !is.na(GO_term))
## Warning: Expected 6 pieces. Missing pieces filled with `NA` in 8 rows [16, 19,
## 21, 22, 24, 28, 31, 32].
write_csv(macau.features.df, path = here::here("analyses/", "macau/", "macau.features.csv")) #write out df with all MACAU features
write(macau.GO$GO_term, file = here::here("analyses/", "macau/", "macau.GO.txt")) #write out df with just GO terms
write.table(subset(macau.features.df, feature=="gene"), here::here("analyses/", "macau/", "MACAU.geneinfo.txt"), sep = "\t") #write out gene info for MACAU loci to associate with UniprotIDs
###### ====== DML Background loci GO terms
# Count string lengths in the Ontology_term column and check out - to figure out max # GO terms
#data.frame(ontology=allLociDML.features.df$Ontology_term,chr=apply(allLociDML.features.df[,"Ontology_term"],2,nchar)) # %>% View()
allLociDML.GO <- allLociDML.features.df[c("contig.allLoci", "start.allLoci", "end.allLoci", "feature", "start.feat", "end.feat", "Note", "Ontology_term")] %>%
distinct(contig.allLoci, start.allLoci, Note, Ontology_term, .keep_all = TRUE) %>%
mutate(Ontology_term = str_replace(Ontology_term, pattern="Ontology_term=",replacement = "")) %>%
mutate(Ontology_term = str_replace(Ontology_term, pattern=";",replacement = "")) %>%
separate(Ontology_term, sep=",", into=paste("GO", 1:11, sep="_")) %>%
pivot_longer(cols=c("GO_1","GO_2","GO_3","GO_4","GO_5","GO_6","GO_7","GO_8","GO_9","GO_10","GO_11"), names_to = "GO_number", values_to = "GO_term") %>%
select(-GO_number) %>%
filter(!is.na(Note) & !is.na(GO_term))
## Warning: Expected 11 pieces. Missing pieces filled with `NA` in 9269 rows
## [15729, 15730, 15731, 15732, 15733, 15734, 15735, 15736, 15737, 15738, 15739,
## 15740, 15741, 15742, 15743, 15744, 15745, 15765, 15766, 15773, ...].
write_csv(allLociDML.features.df, path = here::here("analyses/", "DMLs", "allLociDML.features.txt")) #write out df with all loci features
write(allLociDML.GO$GO_term, file = here::here("analyses/", "DMLs", "allLociDML.GO.txt"))
write.table(subset(allLociDML.features.df, feature=="gene"), here::here("analyses/", "DMLs", "allLociDML.geneinfo.txt"), sep = "\t") #write out gene info for All loci fed into DMLs to associate with UniprotIDs
#save all loci feature df object to file, to use in notebook #12
save(allLociDML.features.df, file=here::here("analyses", "DMLs", "R-objects", "allLociDML.features.df"))
###### ====== MACAU Background loci GO terms
# Count string lengths in the Ontology_term column and check out - to figure out max # GO terms
#data.frame(ontology=allLociMACAU.features.df$Ontology_term,chr=apply(allLociMACAU.features.df[,"Ontology_term"],2,nchar)) #%>% View()
allLociMACAU.GO <- allLociMACAU.features.df[c("contig.allLoci", "start.allLoci", "end.allLoci", "feature", "start.feat", "end.feat", "Note", "Ontology_term")] %>%
distinct(contig.allLoci, start.allLoci, Note, Ontology_term, .keep_all = TRUE) %>%
mutate(Ontology_term = str_replace(Ontology_term, pattern="Ontology_term=",replacement = "")) %>%
mutate(Ontology_term = str_replace(Ontology_term, pattern=";",replacement = "")) %>%
separate(Ontology_term, sep=",", into=paste("GO", 1:11, sep="_")) %>%
pivot_longer(cols=c("GO_1","GO_2","GO_3","GO_4","GO_5","GO_6","GO_7","GO_8","GO_9","GO_10","GO_11"), names_to = "GO_number", values_to = "GO_term") %>%
select(-GO_number) %>%
filter(!is.na(Note) & !is.na(GO_term))
## Warning: Expected 11 pieces. Missing pieces filled with `NA` in 9194 rows
## [15623, 15624, 15625, 15626, 15627, 15628, 15629, 15630, 15631, 15632, 15633,
## 15634, 15635, 15636, 15637, 15638, 15639, 15659, 15660, 15667, ...].
write_csv(allLociMACAU.features.df, path = here::here("analyses/", "macau", "allLociMACAU.features.txt")) #write out df with all loci features
write(allLociMACAU.GO$GO_term, file = here::here("analyses/", "macau", "allLociMACAU.GO.txt"))
write.table(subset(allLociMACAU.features.df, feature=="gene"), here::here("analyses/", "macau", "allLociMACAU.geneinfo.txt"), sep = "\t") #write out gene info for All loci fed into MACAU to associate with UniprotIDs
#save all loci feature df object to file, to use in notebook #12
save(allLociMACAU.features.df, file=here::here("analyses", "macau", "R-objects", "allLociMACAU.features.df"))
hypo_HC_genes <-
DML.features.df %>%
filter(contig.dml %in% hypo_HC$chr &
start.dml %in% 1+hypo_HC$start &
feature == "gene" &
Note != "Note=Protein of unknown function;") %>%
distinct(contig.dml, start.dml, Note, Ontology_term, .keep_all = TRUE)
hypo_HC_GO <- hypo_HC_genes %>%
mutate(Ontology_term = str_replace(Ontology_term, pattern="Ontology_term=",replacement = "")) %>%
mutate(Ontology_term = str_replace(Ontology_term, pattern=";",replacement = "")) %>%
separate(Ontology_term, sep=",", into=paste("GO", 1:3, sep="_")) %>%
pivot_longer(cols=c("GO_1","GO_2","GO_3"), names_to = "GO_number", values_to = "GO_term") %>%
select(-GO_number) %>%
filter(!is.na(Note) & !is.na(GO_term))
write(hypo_HC_GO$GO_term, file = here::here("analyses/", "hypo_HC_GO.txt"))
hypo_SS_genes <- DML.features.df %>%
filter(contig.dml %in% hypo_SS$chr &
start.dml %in% 1+hypo_SS$start &
feature == "gene" &
Note != "Note=Protein of unknown function;") %>%
distinct(contig.dml, start.dml, Note, Ontology_term, .keep_all = TRUE)
hypo_SS_GO <- hypo_SS_genes %>%
mutate(Ontology_term = str_replace(Ontology_term, pattern="Ontology_term=",replacement = "")) %>%
mutate(Ontology_term = str_replace(Ontology_term, pattern=";",replacement = "")) %>%
separate(Ontology_term, sep=",", into=paste("GO", 1:5, sep="_")) %>%
pivot_longer(cols=c("GO_1","GO_2","GO_3", "GO_4", "GO_5"), names_to = "GO_number", values_to = "GO_term") %>%
select(-GO_number) %>%
filter(!is.na(Note) & !is.na(GO_term))
write(hypo_HC_GO$GO_term, file = here::here("analyses/", "hypo_SS_GO.txt"))
# Barplots of differentially methylated loci that are located in known genes
hypo_HC %>%
filter(chr %in% hypo_HC_genes$contig.dml &
start %in% hypo_HC_genes$start.dml-1) %>%
ggplot(aes(x = population, y = mean_percMeth, fill = population,
label=paste0(round(mean_percMeth, digits = 2), "%"))) +
geom_bar(stat="identity", width = 0.5) + ylim(0,110) +
geom_pointrange(aes(ymin=mean_percMeth,
ymax=mean_percMeth+sd_percMeth, width=0.15)) +
geom_text(size=3, vjust=-0.5, hjust=1.1) +
theme_minimal() + xlab("Population") + ylab("% Methylation") +
scale_y_continuous(breaks=c(0,25,50,75,100)) +
scale_fill_manual(values=c("firebrick3","dodgerblue3")) +
facet_wrap_paginate(~chr + start, nrow=2, ncol=3, page=2)
hypo_SS %>%
filter(chr %in% hypo_SS_genes$contig.dml &
start %in% hypo_SS_genes$start.dml-1) %>%
ggplot(aes(x = population, y = mean_percMeth, fill = population,
label=paste0(round(mean_percMeth, digits = 2), "%"))) +
geom_bar(stat="identity", width = 0.5) + ylim(0,110) +
geom_pointrange(aes(ymin=mean_percMeth,
ymax=mean_percMeth+sd_percMeth, width=0.15)) +
geom_text(size=3, vjust=-0.5, hjust=1.1) +
theme_minimal() + xlab("Population") + ylab("% Methylation") +
scale_y_continuous(breaks=c(0,25,50,75,100)) +
scale_fill_manual(values=c("firebrick3","dodgerblue3")) +
facet_wrap_paginate(~chr + start, nrow=2, ncol=3, page=2)
echo "Loci differentially methylated between SS and HC populations:"
cat "../analyses/dml25.bed" | wc -l
echo "Loci that overlap with genes:"
bedtools intersect -u -a "../analyses/dml25.bed" -b "../genome-features/Olurida_v081-20190709.gene.gff" | wc -l
echo "Loci that overlap with gene regions (genes +/- 2kb):"
bedtools intersect -u -a "../analyses/dml25.bed" -b "../genome-features/Olurida_v081-20190709.gene.2kbslop.gff" | wc -l
echo "Loci that overlap with exons:"
bedtools intersect -u -a "../analyses/dml25.bed" -b "../genome-features/Olurida_v081-20190709.exon.gff" | wc -l
echo "Loci that overlap with coding sequences:"
bedtools intersect -u -a "../analyses/dml25.bed" -b "../genome-features/Olurida_v081-20190709.CDS.gff" | wc -l
echo "Loci that overlap with mRNA:"
bedtools intersect -u -a "../analyses/dml25.bed" -b "../genome-features/Olurida_v081-20190709.mRNA.gff" | wc -l
echo "Loci that overlap with transposable elements:"
bedtools intersect -u -a "../analyses/dml25.bed" -b "../genome-features/Olurida_v081_TE-Cg.gff" | wc -l
echo "Loci that overlap with alternative splice variants:"
bedtools intersect -u -a "../analyses/dml25.bed" -b "../genome-features/20190709-Olurida_v081.stringtie.gtf" | wc -l
echo "Loci that do not overlap with known features:"
bedtools intersect -u -a "../analyses/dml25.bed" -b "../genome-features/Olurida_v081-20190709.gene.gff" "../genome-features/Olurida_v081-20190709.gene.2kbslop.gff" "../genome-features/Olurida_v081-20190709.exon.gff" "../genome-features/Olurida_v081-20190709.CDS.gff" "../genome-features/Olurida_v081-20190709.mRNA.gff" "../genome-features/Olurida_v081_TE-Cg.gff" "../genome-features/20190709-Olurida_v081.stringtie.gtf" | wc -l
echo "genes"
bedtools intersect -u -a "../analyses/mydiff-all.bed" -b "../genome-features/Olurida_v081-20190709.gene.gff" | wc -l
echo "gene regions (+/- 2kb)"
bedtools intersect -u -a "../analyses/mydiff-all.bed" -b "../genome-features/Olurida_v081-20190709.gene.2kbslop.gff" | wc -l
echo "exon"
bedtools intersect -u -a "../analyses/mydiff-all.bed" -b "../genome-features/Olurida_v081-20190709.exon.gff" | wc -l
echo "CDS"
bedtools intersect -u -a "../analyses/mydiff-all.bed" -b "../genome-features/Olurida_v081-20190709.CDS.gff" | wc -l
echo "mRNA"
bedtools intersect -u -a "../analyses/mydiff-all.bed" -b "../genome-features/Olurida_v081-20190709.mRNA.gff" | wc -l
echo "TE"
bedtools intersect -u -a "../analyses/mydiff-all.bed" -b "../genome-features/Olurida_v081_TE-Cg.gff" | wc -l
echo "ASV"
bedtools intersect -u -a "../analyses/mydiff-all.bed" -b "../genome-features/20190709-Olurida_v081.stringtie.gtf" | wc -l
echo "unknown"
bedtools intersect -v -a "../analyses/mydiff-all.bed" -b "../genome-features/Olurida_v081-20190709.gene.gff" "../genome-features/Olurida_v081-20190709.exon.gff" "../genome-features/Olurida_v081-20190709.CDS.gff" "../genome-features/Olurida_v081-20190709.mRNA.gff" "../genome-features/Olurida_v081_TE-Cg.gff" "../genome-features/20190709-Olurida_v081.stringtie.gtf" | wc -l
echo "Total methylated loci:"
cat ../analyses/macau/macau-all-loci.bed | wc -l
echo "Loci associated with shell length (MACAU):"
cat "../analyses/macau/macau.sign.length.perc.meth.bed" | wc -l
echo "Loci that overlap with genes:"
bedtools intersect -u -a "../analyses/macau/macau.sign.length.perc.meth.bed" -b "../genome-features/Olurida_v081-20190709.gene.gff" | wc -l
echo "Loci that overlap with gene regions (+/- 2kb):"
bedtools intersect -u -a "../analyses/macau/macau.sign.length.perc.meth.bed" -b "../genome-features/Olurida_v081-20190709.gene.2kbslop.gff" | wc -l
echo "Loci that overlap with exons:"
bedtools intersect -u -a "../analyses/macau/macau.sign.length.perc.meth.bed" -b "../genome-features/Olurida_v081-20190709.exon.gff" | wc -l
echo "Loci that overlap with coding sequences:"
bedtools intersect -u -a "../analyses/macau/macau.sign.length.perc.meth.bed" -b "../genome-features/Olurida_v081-20190709.CDS.gff" | wc -l
echo "Loci that overlap with mRNA:"
bedtools intersect -u -a "../analyses/macau/macau.sign.length.perc.meth.bed" -b "../genome-features/Olurida_v081-20190709.mRNA.gff" | wc -l
echo "Loci that overlap with transposable elements:"
bedtools intersect -u -a "../analyses/macau/macau.sign.length.perc.meth.bed" -b "../genome-features/Olurida_v081_TE-Cg.gff" | wc -l
echo "Loci that overlap with alternative splice variants:"
bedtools intersect -u -a "../analyses/macau/macau.sign.length.perc.meth.bed" -b "../genome-features/20190709-Olurida_v081.stringtie.gtf" | wc -l
echo "Loci that do not overlap with known features:"
bedtools intersect -v -a "../analyses/macau/macau.sign.length.perc.meth.bed" -b "../genome-features/Olurida_v081-20190709.gene.gff" "../genome-features/Olurida_v081-20190709.exon.gff" "../genome-features/Olurida_v081-20190709.CDS.gff" "../genome-features/Olurida_v081-20190709.mRNA.gff" "../genome-features/Olurida_v081_TE-Cg.gff" "../genome-features/20190709-Olurida_v081.stringtie.gtf" | wc -l