This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.
Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Ctrl+Shift+Enter.
library(radiator)
library(adegenet)
## Loading required package: ade4
##
## /// adegenet 2.1.2 is loaded ////////////
##
## > overview: '?adegenet'
## > tutorials/doc/questions: 'adegenetWeb()'
## > bug reports/feature requests: adegenetIssues()
library(PCAviz)
#PCA of all samples uses VCF
##Convert VCF to genind
data <- read_vcf("../data/HCSS_Afilt32m70_01_pp90.vcf",strata = "../../SSHCfiltNreps.strata",parallel.core = 1,vcf.stats = FALSE, filter.monomorphic=FALSE)
## Execution date@time: 20200624@1220
## Folder created: read_vcf_20200624@1220
## Function call and arguments stored in: radiator_read_vcf_args_20200624@1220.tsv
## File written: random.seed (79092)
##
## Reading VCF
## Data summary:
## number of samples: 114
## number of markers: 5269
## done! timing: 1 sec
##
## File written: radiator_20200624@1220.gds
##
## Analyzing the data...
## VCF source: unknown
## Data is bi-allelic
## Synchronizing data and strata...
## Number of strata: 2
## Number of individuals: 114
## Reads assembly: de novo
## Filters parameters file generated: filters_parameters_20200624@1220.tsv
## Filters parameters file: initiated and updated
## ################################################################################
## ####################### radiator::filter_common_markers ########################
## ################################################################################
## Execution date@time: 20200624@1220
## Function call and arguments stored in: radiator_filter_common_markers_args_20200624@1220.tsv
## Filters parameters file: initiated
## Scanning for common markers...
## Generating UpSet plot to visualize markers in common
## File written: whitelist.common.markers_20200624@1220.tsv
## Filters parameters file: updated
## ################################### RESULTS ####################################
##
## Filter common markers:
## Number of individuals / strata / chrom / locus / SNP:
## Before: 114 / 2 / 1 / 3408 / 5269
## Blacklisted: 0 / 0 / 0 / 0 / 0
## After: 114 / 2 / 1 / 3408 / 5269
##
## Computation time, overall: 1 sec
## ####################### completed filter_common_markers ########################
##
## Preparing output files...
## File written: whitelist.markers.tsv
## Writing the filtered strata: strata.filtered.tsvstrata.filtered.tsv
## ################################### SUMMARY ####################################
##
##
## Number of chromosome/contig/scaffold: 1
## Number of locus: 3408
## Number of markers: 5269
## Number of populations: 2
## Number of individuals: 114
##
## Number of ind/pop:
## HC = 57
## SS = 57
##
## Number of duplicate id: 0
## radiator Genomic Data Structure (GDS) file: radiator_20200624@1220.gds
##
## Computation time, overall: 5 sec
## ############################## completed read_vcf ##############################
gen <- genomic_converter(data,strata = "../../SSHCfiltNreps.strata",output = "genind", filter.monomorphic=F)
## ################################################################################
## ######################### radiator::genomic_converter ##########################
## ################################################################################
## Execution date@time: 20200624@1220
## Folder created: 08_radiator_genomic_converter_20200624@1220
## Function call and arguments stored in: radiator_genomic_converter_args_20200624@1220.tsv
## Filters parameters file generated: filters_parameters_20200624@1220.tsv
##
## Importing data
## Synchronizing data and strata...
## Number of strata: 2
## Number of individuals: 114
##
## Writing tidy data set:
## radiator_data_20200624@1220.rad
##
## Computation time, overall: 7 sec
##
## Preparing data for output
## Data is bi-allelic
## Generating adegenet genind object
## ################################### RESULTS ####################################
## Data format of input: SeqVarGDSClass
## Biallelic data
## Number of markers: 5269
## Number of chromosome/contig/scaffold: 1
## Number of strata: 2
## Number of individuals: 114
##
## Computation time, overall: 10 sec
## ######################### completed genomic_converter ##########################
##Run PCA
gen$genind
## /// GENIND OBJECT /////////
##
## // 114 individuals; 5,269 loci; 10,538 alleles; size: 7.8 Mb
##
## // Basic content
## @tab: 114 x 10538 matrix of allele counts
## @loc.n.all: number of alleles per locus (range: 2-2)
## @loc.fac: locus factor for the 10538 columns of @tab
## @all.names: list of allele names for each locus
## @ploidy: ploidy of each individual (range: 2-2)
## @type: codom
## @call: radiator::write_genind(data = input, write = TRUE)
##
## // Optional content
## @pop: population of each individual (group size range: 57-57)
## @strata: a data frame with 2 columns ( POP_ID, INDIVIDUALS )
NA.afDraw<- function(ind){
ind.mat <- ind@tab
new.mat <- ind.mat
af = colSums(ind.mat[,seq(1,ncol(ind.mat)-1,2)],na.rm = TRUE)/
(2*apply(ind.mat[,seq(1,ncol(ind.mat)-1,2)],2,function(x) sum(!is.na(x))))
af.Draw <- function(geno, af){
new <- function(geno,af){
if(is.na(geno)){
newA = rbinom(1,2,af)
}
else {newA <- geno}
return(newA)
}
new.row <- mapply(geno,af,FUN = new)
return(new.row)}
new.mat[,seq(1,ncol(ind.mat)-1,2)] <- t(apply(ind.mat[,seq(1,ncol(ind.mat)-1,2)],1,af.Draw,af))
new.mat[,seq(2,ncol(ind.mat),2)] <- 2-new.mat[,seq(1,ncol(ind.mat)-1,2)]
new.ind <- ind
new.ind@tab <- new.mat
return(new.ind)
}
u.na <- NA.afDraw(gen$genind)
#pca <- dudi.pca(u.na,cent=TRUE,scale=TRUE,scannf = T)
pca <- dudi.pca(u.na,cent=TRUE,scale=TRUE,scannf = F, nf = 10)
col6 <- c("red","blue")
par(mfrow=c(2,2))
s.class(pca$li, strata(u.na)$POP_ID,xax=1,yax=2,
sub = "HCSS_Afilt32m70_01_pp90, PC1-2, 114 inds, 5269 SNPs ",
possub = "topleft",col=transp(col6,.6),axesell=FALSE,
cstar=0, cpoint=3, grid=FALSE, cellipse = 0)
s.class(pca$li, strata(u.na)$POP_ID,xax=1,yax=3,
sub = "HCSS_Afilt32m70_01_pp90, PC1-3, 114 inds, 5269 SNPs ",
possub = "topleft",col=transp(col6,.6),axesell=FALSE,
cstar=0, cpoint=3, grid=FALSE, cellipse = 0)
s.class(pca$li, strata(u.na)$POP_ID,xax=1,yax=4,
sub = "HCSS_Afilt32m70_01_pp90, PC1-4, 114 inds, 5269 SNPs ",
possub = "topleft",col=transp(col6,.6),axesell=FALSE,
cstar=0, cpoint=3, grid=FALSE, cellipse = 0)
s.class(pca$li, strata(u.na)$POP_ID,xax=1,yax=5,
sub = "HCSS_Afilt32m70_01_pp90, PC1-5, 114 inds, 5269 SNPs ",
possub = "topleft",col=transp(col6,.6),axesell=FALSE,
cstar=0, cpoint=3, grid=FALSE, cellipse = 0)
eig.perc <- 100*pca$eig/sum(pca$eig)
head(eig.perc)
## [1] 6.644372 5.073012 4.562529 4.208986 3.060878 2.963567
Saving PCA
write.table(pca$li,file = "../analyses/2bRAD/PCA_allHCSS.tab",sep="\t",quote = F)
save(pca, file = "../analyses/2bRAD/PCA_allHCSS.Robj")
##Get top loading SNPs
library(factoextra)
## Loading required package: ggplot2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
PCA.var.loadings <- get_pca_var(pca)
#data.frame(PCA.var.loadings$contrib) # % that each variable (locus) contributes to PCA dimensions. We're interested in Dim1 and Dim2.
# Subset variable loadings for those that contribute >5% to PC axes
loci <- rownames(subset(data.frame(PCA.var.loadings$contrib), Dim.1>0.05)) #axis 1
loci = sapply(strsplit(loci,"\\."), '[',1)
loci <- unique(loci)
length(loci)
## [1] 107
contig <- sapply(strsplit(loci,"__"), '[',2)
pos <- sapply(strsplit(loci,"__"), '[',3)
positions <- cbind(contig,pos)
head(positions)
## contig pos
## [1,] "Contig101655" "12681"
## [2,] "Contig104078" "683"
## [3,] "Contig107120" "3263"
## [4,] "Contig109783" "670"
## [5,] "Contig121049" "2908"
## [6,] "Contig122227" "2742"
# read in fst per site
fst= read.csv("../analyses/2bRAD/HCSS_sfsm70_PerSiteFst.csv")
head(fst)
## contig pos a b fst
## 1 Contig0 74432 0.000 0.009 0.00000000
## 2 Contig0 109103 0.034 0.457 0.07423581
## 3 Contig0 109119 0.003 0.052 0.05660377
## 4 Contig0 109123 0.002 0.067 0.02941176
## 5 Contig1 40362 0.000 0.009 0.00000000
## 6 Contig1 42880 0.003 0.078 0.03797468
# get fst for the top 5% of PC1 loadings, and save
pc1fst <- merge(positions,fst, all.x=T)[,c('contig','pos','fst')]
## Warning in `[<-.factor`(`*tmp*`, ri, value = c(74432L, 109103L, 109119L, :
## invalid factor level, NA generated
head(pc1fst)
## contig pos fst
## 1 Contig101655 12681 0.1790393
## 2 Contig104078 683 0.2639138
## 3 Contig107120 3263 0.5111441
## 4 Contig109783 670 0.7590206
## 5 Contig121049 2908 0.3650255
## 6 Contig122227 2742 0.1706349
write.table(pc1fst,"../analyses/2bRAD/Fst_PCA_allHCSS_95.tab",sep="\t",quote = F,row.names = F)
##old way
li <-pca$li
c1 <- pca$c1
colnames(c1) <- colnames(li)
#create pcaviz object
pviz <- pcaviz(x=li,rotation=c1,dat=u.na$strata)
pcaviz_loadingsplot(pviz,min.rank = 0.95)
PC1snps = rownames(pca$c1[which(abs(pca$c1$CS1) > quantile(abs(pca$c1$CS1),0.95)),])
length(PC1snps)
## [1] 527
head(PC1snps)
## [1] "1__Contig101655__12681.A1__A" "1__Contig101655__12681.A2__T"
## [3] "1__Contig101935__6468.A1__G" "1__Contig101935__6468.A2__A"
## [5] "1__Contig103053__1134.A1__C" "1__Contig103053__1134.A2__T"
loci = sapply(strsplit(PC1snps,"\\."), '[',1)
loci <- unique(loci)
contig <- sapply(strsplit(loci,"__"), '[',2)
pos <- sapply(strsplit(loci,"__"), '[',3)
positions <- cbind(contig,pos)
head(positions)
## contig pos
## [1,] "Contig101655" "12681"
## [2,] "Contig101935" "6468"
## [3,] "Contig103053" "1134"
## [4,] "Contig103053" "1136"
## [5,] "Contig103346" "16201"
## [6,] "Contig104078" "683"
# read in fst per site
fst= read.csv("../analyses/2bRAD/HCSS_sfsm70_PerSiteFst.csv")
head(fst)
## contig pos a b fst
## 1 Contig0 74432 0.000 0.009 0.00000000
## 2 Contig0 109103 0.034 0.457 0.07423581
## 3 Contig0 109119 0.003 0.052 0.05660377
## 4 Contig0 109123 0.002 0.067 0.02941176
## 5 Contig1 40362 0.000 0.009 0.00000000
## 6 Contig1 42880 0.003 0.078 0.03797468
# get fst for the top 5% of PC1 loadings, and save
pc1fst <- merge(positions,fst, all.x=T)[,c('contig','pos','fst')]
## Warning in `[<-.factor`(`*tmp*`, ri, value = c(74432L, 109103L, 109119L, :
## invalid factor level, NA generated
head(pc1fst)
## contig pos fst
## 1 Contig101655 12681 0.1790393
## 2 Contig101935 6468 0.3543046
## 3 Contig103053 1134 0.2765957
## 4 Contig103053 1136 0.2765957
## 5 Contig103346 16201 0.1146667
## 6 Contig104078 683 0.2639138
write.table(pc1fst,"../analyses/2bRAD/Fst_PCA_allHCSS_95.tab",sep="\t",quote = F,row.names = F)
#PCA of just MBD samples
mbd = c("HC1-2B-L5","HC1-4","HC2-15A-L5","HC2-17B","HC3-1","HC3-5-L5B","HC3-7","HC3-10","HC3-11",
"SS2-9-L5","SS2-14-L5","SS2-18-L5","SS3-3","SS3-14","SS3-15","SS3-16","SS3-20","SS5-18")
genmdb <- u.na[i=mbd,drop=TRUE]
genmdb <- genmdb[loc=isPoly(genmdb)]
## Warning in validityMethod(object): @tab does not contain integers; as of
## adegenet_2.0-0, numeric values are no longer used
genmdb
## /// GENIND OBJECT /////////
##
## // 18 individuals; 4,382 loci; 8,764 alleles; size: 3.9 Mb
##
## // Basic content
## @tab: 18 x 8764 matrix of allele counts
## @loc.n.all: number of alleles per locus (range: 2-2)
## @loc.fac: locus factor for the 8764 columns of @tab
## @all.names: list of allele names for each locus
## @ploidy: ploidy of each individual (range: 2-2)
## @type: codom
## @call: .local(x = x, i = i, j = j, loc = ..1, drop = drop)
##
## // Optional content
## @pop: population of each individual (group size range: 9-9)
## @strata: a data frame with 2 columns ( POP_ID, INDIVIDUALS )
#pca <- dudi.pca(genmdb,cent=TRUE,scale=TRUE,scannf = T)
pcaM <- dudi.pca(genmdb,cent=TRUE,scale=TRUE,scannf = F, nf = 10)
col6 <- c("red","blue")
par(mfrow=c(2,2))
s.class(pcaM$li, strata(genmdb)$POP_ID,xax=1,yax=2,
sub = "HCSS_Afilt32m70_01_pp90 MBD, PC1-2, 18 inds, 4373 SNPs ",
possub = "topleft",col=transp(col6,.6),axesell=FALSE,
cstar=0, cpoint=3, grid=FALSE, cellipse = 0)
s.class(pcaM$li, strata(genmdb)$POP_ID,xax=1,yax=3,
sub = "HCSS_Afilt32m70_01_pp90 MBD, PC1-3, 18 inds, 4373 SNPs ",
possub = "topleft",col=transp(col6,.6),axesell=FALSE,
cstar=0, cpoint=3, grid=FALSE, cellipse = 0)
s.class(pcaM$li, strata(genmdb)$POP_ID,xax=1,yax=4,
sub = "HCSS_Afilt32m70_01_pp90 MBD, PC1-4, 18 inds, 4373 SNPs ",
possub = "topleft",col=transp(col6,.6),axesell=FALSE,
cstar=0, cpoint=3, grid=FALSE, cellipse = 0)
s.class(pcaM$li, strata(genmdb)$POP_ID,xax=1,yax=5,
sub = "HCSS_Afilt32m70_01_pp90 MBD, PC1-5, 18 inds, 4373 SNPs ",
possub = "topleft",col=transp(col6,.6),axesell=FALSE,
cstar=0, cpoint=3, grid=FALSE, cellipse = 0)
eig.perc <- 100*pcaM$eig/sum(pcaM$eig)
head(eig.perc)
## [1] 12.365344 9.051154 7.673460 7.288396 6.416726 6.216463
Saving PCA
write.table(pcaM$li,file = "../analyses/2bRAD/PCA_MBD.tab",sep="\t",quote = F)
save(pcaM, file = "../analyses/2bRAD/PCA_MBD.Robj")
##Get top loading SNPs
library(factoextra)
PCA.var.loadings <- get_pca_var(pcaM)
#data.frame(PCA.var.loadings$contrib) # % that each variable (locus) contributes to PCA dimensions. We're interested in Dim1 and Dim2.
# Subset variable loadings for those that contribute >5% to PC axes
loci <- rownames(subset(data.frame(PCA.var.loadings$contrib), Dim.1>0.05)) #axis 1
loci = sapply(strsplit(loci,"\\."), '[',1)
loci <- unique(loci)
length(loci)
## [1] 110
contig <- sapply(strsplit(loci,"__"), '[',2)
pos <- sapply(strsplit(loci,"__"), '[',3)
positions <- cbind(contig,pos)
head(positions)
## contig pos
## [1,] "Contig101935" "6468"
## [2,] "Contig107120" "3263"
## [3,] "Contig109827" "6152"
## [4,] "Contig112828" "139"
## [5,] "Contig117256" "646"
## [6,] "Contig117622" "6364"
# get fst for the top 5% of PC1 loadings, and save
pc1fstM <- merge(positions,fst, all.x=T)[,c('contig','pos','fst')]
## Warning in `[<-.factor`(`*tmp*`, ri, value = c(74432L, 109103L, 109119L, :
## invalid factor level, NA generated
head(pc1fstM)
## contig pos fst
## 1 Contig101935 6468 0.3543046
## 2 Contig107120 3263 0.5111441
## 3 Contig109827 6152 0.1503132
## 4 Contig112828 139 0.2288288
## 5 Contig117256 646 0.1486989
## 6 Contig117622 6364 0.2860963
write.table(pc1fstM,"../analyses/2bRAD/Fst_PCA_MBD_95.tab",sep="\t",quote = F,row.names = F)
###Old, gets top 5% quantile
li <-pcaM$li
c1 <- pcaM$c1
colnames(c1) <- colnames(li)
#create pcaviz object
pvizM <- pcaviz(x=li,rotation=c1,dat=genmdb$strata)
pcaviz_loadingsplot(pvizM,min.rank = 0.95)
PC1snpsM = rownames(pcaM$c1[which(abs(pcaM$c1$CS1) > quantile(abs(pcaM$c1$CS1),0.95)),])
length(PC1snpsM)
## [1] 438
head(PC1snpsM)
## [1] "1__Contig101335__1205.A1__C" "1__Contig101335__1205.A2__T"
## [3] "1__Contig101935__6468.A1__G" "1__Contig101935__6468.A2__A"
## [5] "1__Contig103298__7435.A1__G" "1__Contig103298__7435.A2__C"
lociM = sapply(strsplit(PC1snpsM,"\\."), '[',1)
lociM <- unique(lociM)
contig <- sapply(strsplit(lociM,"__"), '[',2)
pos <- sapply(strsplit(lociM,"__"), '[',3)
positionsM <- cbind(contig,pos)
head(positionsM)
## contig pos
## [1,] "Contig101335" "1205"
## [2,] "Contig101935" "6468"
## [3,] "Contig103298" "7435"
## [4,] "Contig106125" "3829"
## [5,] "Contig107120" "3263"
## [6,] "Contig108706" "5626"
# get fst for the top 5% of PC1 loadings, and save
pc1fstM <- merge(positionsM,fst, all.x=T)[,c('contig','pos','fst')]
## Warning in `[<-.factor`(`*tmp*`, ri, value = c(74432L, 109103L, 109119L, :
## invalid factor level, NA generated
head(pc1fstM)
## contig pos fst
## 1 Contig101335 1205 NA
## 2 Contig101935 6468 0.35430464
## 3 Contig103298 7435 0.06282723
## 4 Contig106125 3829 0.27296588
## 5 Contig107120 3263 0.51114413
## 6 Contig108706 5626 0.12138728
write.table(pc1fstM,"../analyses/2bRAD/Fst_PCA_MBD_95.tab",sep="\t",quote = F,row.names = F)
Add a new chunk by clicking the Insert Chunk button on the toolbar or by pressing Ctrl+Alt+I.
When you save the notebook, an HTML file containing the code and output will be saved alongside it (click the Preview button or press Ctrl+Shift+K to preview the HTML file).