This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.
Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Ctrl+Shift+Enter.
library(radiator)
library(adegenet)
Loading required package: ade4
/// adegenet 2.1.2 is loaded ////////////
> overview: '?adegenet'
> tutorials/doc/questions: 'adegenetWeb()'
> bug reports/feature requests: adegenetIssues()
library(PCAviz)
#PCA of all samples uses VCF
##Convert VCF to genind
data <- read_vcf("../data/HCSS_Afilt32m70_01_pp90.vcf",strata = "../../SSHCfiltNreps.strata",parallel.core = 1,vcf.stats = FALSE, filter.monomorphic=FALSE)
Execution date@time: 20200707@1623
Folder created: read_vcf_20200707@1623
Function call and arguments stored in: radiator_read_vcf_args_20200707@1623.tsv
File written: random.seed (299848)
Reading VCF
Data summary:
number of samples: 114
number of markers: 5269
done! timing: 1 sec
File written: radiator_20200707@1624.gds
Analyzing the data...
VCF source: unknown
Data is bi-allelic
Synchronizing data and strata...
Number of strata: 2
Number of individuals: 114
Reads assembly: de novo
Filters parameters file generated: filters_parameters_20200707@1623.tsv
Filters parameters file: initiated and updated
################################################################################
####################### radiator::filter_common_markers ########################
################################################################################
Execution date@time: 20200707@1624
Function call and arguments stored in: radiator_filter_common_markers_args_20200707@1624.tsv
Filters parameters file: initiated
Scanning for common markers...
Generating UpSet plot to visualize markers in common
File written: whitelist.common.markers_20200707@1624.tsv
Filters parameters file: updated
################################### RESULTS ####################################
Filter common markers:
Number of individuals / strata / chrom / locus / SNP:
Before: 114 / 2 / 1 / 3408 / 5269
Blacklisted: 0 / 0 / 0 / 0 / 0
After: 114 / 2 / 1 / 3408 / 5269
Computation time, overall: 1 sec
####################### completed filter_common_markers ########################
Preparing output files...
File written: whitelist.markers.tsv
Writing the filtered strata: strata.filtered.tsvstrata.filtered.tsv
################################### SUMMARY ####################################
Number of chromosome/contig/scaffold: 1
Number of locus: 3408
Number of markers: 5269
Number of populations: 2
Number of individuals: 114
Number of ind/pop:
HC = 57
SS = 57
Number of duplicate id: 0
radiator Genomic Data Structure (GDS) file: radiator_20200707@1624.gds
Computation time, overall: 12 sec
############################## completed read_vcf ##############################
gen <- genomic_converter(data,strata = "../../SSHCfiltNreps.strata",output = "genind", filter.monomorphic=F)
################################################################################
######################### radiator::genomic_converter ##########################
################################################################################
Execution date@time: 20200707@1624
Folder created: 13_radiator_genomic_converter_20200707@1624
Function call and arguments stored in: radiator_genomic_converter_args_20200707@1624.tsv
Filters parameters file generated: filters_parameters_20200707@1624.tsv
Importing data
Synchronizing data and strata...
Number of strata: 2
Number of individuals: 114
Writing tidy data set:
radiator_data_20200707@1624.rad
Computation time, overall: 9 sec
Preparing data for output
Data is bi-allelic
Generating adegenet genind object
################################### RESULTS ####################################
Data format of input: SeqVarGDSClass
Biallelic data
Number of markers: 5269
Number of chromosome/contig/scaffold: 1
Number of strata: 2
Number of individuals: 114
Computation time, overall: 14 sec
######################### completed genomic_converter ##########################
##Run PCA
HCSSall <- gen$genind
HCSSall
/// GENIND OBJECT /////////
// 114 individuals; 5,269 loci; 10,538 alleles; size: 7.8 Mb
// Basic content
@tab: 114 x 10538 matrix of allele counts
@loc.n.all: number of alleles per locus (range: 2-2)
@loc.fac: locus factor for the 10538 columns of @tab
@all.names: list of allele names for each locus
@ploidy: ploidy of each individual (range: 2-2)
@type: codom
@call: radiator::write_genind(data = input, write = TRUE)
// Optional content
@pop: population of each individual (group size range: 57-57)
@strata: a data frame with 2 columns ( POP_ID, INDIVIDUALS )
save(HCSSall,file="../analyses/2bRAD/HCSS_Afilt32m70_01_pp90.genind")
geno <- tab(HCSSall)
geno <- geno[,seq(1,ncol(geno)-1,2)]
head(geno)[,c(1:10)]
1__Contig0__109103.A1__C 1__Contig0__109119.A1__T 1__Contig0__109120.A1__G
HC1-11 1 1 2
HC1-12 2 2 2
HC1-13 0 0 2
HC1-14-L5 2 2 2
HC1-15 1 1 2
HC1-18 1 2 NA
1__Contig0__109123.A1__T 1__Contig100040__508.A1__G 1__Contig100040__510.A1__T
HC1-11 1 2 2
HC1-12 2 2 1
HC1-13 0 0 2
HC1-14-L5 2 2 1
HC1-15 1 2 2
HC1-18 2 0 2
1__Contig100040__516.A1__A 1__Contig100040__524.A1__T 1__Contig100415__3978.A1__C
HC1-11 2 2 0
HC1-12 2 1 2
HC1-13 2 2 1
HC1-14-L5 2 2 1
HC1-15 2 2 0
HC1-18 2 2 1
1__Contig100504__842.A1__T
HC1-11 1
HC1-12 2
HC1-13 2
HC1-14-L5 2
HC1-15 2
HC1-18 2
write.table(geno,file="../analyses/2bRAD/HCSS_Afilt32m70_01_pp90.geno",quote = F)
NA.afDraw<- function(ind){
ind.mat <- ind@tab
new.mat <- ind.mat
af = colSums(ind.mat[,seq(1,ncol(ind.mat)-1,2)],na.rm = TRUE)/
(2*apply(ind.mat[,seq(1,ncol(ind.mat)-1,2)],2,function(x) sum(!is.na(x))))
af.Draw <- function(geno, af){
new <- function(geno,af){
if(is.na(geno)){
newA = rbinom(1,2,af)
}
else {newA <- geno}
return(newA)
}
new.row <- mapply(geno,af,FUN = new)
return(new.row)}
new.mat[,seq(1,ncol(ind.mat)-1,2)] <- t(apply(ind.mat[,seq(1,ncol(ind.mat)-1,2)],1,af.Draw,af))
new.mat[,seq(2,ncol(ind.mat),2)] <- 2-new.mat[,seq(1,ncol(ind.mat)-1,2)]
new.ind <- ind
new.ind@tab <- new.mat
return(new.ind)
}
u.na <- NA.afDraw(gen$genind)
pca <- dudi.pca(u.na,cent=TRUE,scale=TRUE,scannf = T)
Select the number of axes:
10
pca <- dudi.pca(u.na,cent=TRUE,scale=TRUE,scannf = F, nf = 10)
col6 <- c("red","blue")
par(mfrow=c(2,2))
s.class(pca$li, strata(u.na)$POP_ID,xax=1,yax=2,
sub = "HCSS_Afilt32m70_01_pp90, PC1-2, 114 inds, 5269 SNPs ",
possub = "topleft",col=transp(col6,.6),axesell=FALSE,
cstar=0, cpoint=3, grid=FALSE, cellipse = 0)
s.class(pca$li, strata(u.na)$POP_ID,xax=1,yax=3,
sub = "HCSS_Afilt32m70_01_pp90, PC1-3, 114 inds, 5269 SNPs ",
possub = "topleft",col=transp(col6,.6),axesell=FALSE,
cstar=0, cpoint=3, grid=FALSE, cellipse = 0)
s.class(pca$li, strata(u.na)$POP_ID,xax=1,yax=4,
sub = "HCSS_Afilt32m70_01_pp90, PC1-4, 114 inds, 5269 SNPs ",
possub = "topleft",col=transp(col6,.6),axesell=FALSE,
cstar=0, cpoint=3, grid=FALSE, cellipse = 0)
s.class(pca$li, strata(u.na)$POP_ID,xax=1,yax=5,
sub = "HCSS_Afilt32m70_01_pp90, PC1-5, 114 inds, 5269 SNPs ",
possub = "topleft",col=transp(col6,.6),axesell=FALSE,
cstar=0, cpoint=3, grid=FALSE, cellipse = 0)
eig.perc <- 100*pca$eig/sum(pca$eig)
head(eig.perc)
[1] 6.651406 5.064544 4.572244 4.176433 3.049277 2.982483
Saving PCA
write.table(pca$li,file = "../analyses/2bRAD/PCA_allHCSS.tab",sep="\t",quote = F)
save(pca, file = "../analyses/2bRAD/PCA_allHCSS.Robj")
##Get top loading SNPs
library(factoextra)
PCA.var.loadings <- get_pca_var(pca)
#data.frame(PCA.var.loadings$contrib) # % that each variable (locus) contributes to PCA dimensions. We're interested in Dim1 and Dim2.
# Subset variable loadings for those that contribute >5% to PC axes
loci <- rownames(subset(data.frame(PCA.var.loadings$contrib), Dim.1>0.05)) #axis 1
loci = sapply(strsplit(loci,"\\."), '[',1)
loci <- unique(loci)
length(loci)
[1] 110
contig <- sapply(strsplit(loci,"__"), '[',2)
pos <- sapply(strsplit(loci,"__"), '[',3)
positions <- cbind(contig,pos)
head(positions)
contig pos
[1,] "Contig101655" "12681"
[2,] "Contig104078" "683"
[3,] "Contig107120" "3263"
[4,] "Contig109783" "670"
[5,] "Contig120378" "211"
[6,] "Contig121049" "2908"
# read in fst per site
fst= read.csv("../analyses/2bRAD/HCSS_sfsm70_PerSiteFst.csv")
head(fst)
contig pos a b fst
1 Contig0 74432 0.000 0.009 0.00000000
2 Contig0 109103 0.034 0.457 0.07423581
3 Contig0 109119 0.003 0.052 0.05660377
4 Contig0 109123 0.002 0.067 0.02941176
5 Contig1 40362 0.000 0.009 0.00000000
6 Contig1 42880 0.003 0.078 0.03797468
# get fst for the top 5% of PC1 loadings, and save
pc1fst <- merge(positions,fst, all.x=T)[,c('contig','pos','fst')]
invalid factor level, NA generated
head(pc1fst)
contig pos fst
1 Contig101655 12681 0.1790393
2 Contig104078 683 0.2639138
3 Contig107120 3263 0.5111441
4 Contig109783 670 0.7590206
5 Contig120378 211 0.3101695
6 Contig121049 2908 0.3650255
write.table(pc1fst,"../analyses/2bRAD/Fst_PCA_allHCSS_95.tab",sep="\t",quote = F,row.names = F)
##old way
li <-pca$li
c1 <- pca$c1
colnames(c1) <- colnames(li)
#create pcaviz object
pviz <- pcaviz(x=li,rotation=c1,dat=u.na$strata)
pcaviz_loadingsplot(pviz,min.rank = 0.95)
PC1snps = rownames(pca$c1[which(abs(pca$c1$CS1) > quantile(abs(pca$c1$CS1),0.95)),])
length(PC1snps)
[1] 527
head(PC1snps)
[1] "1__Contig101655__12681.A1__A" "1__Contig101655__12681.A2__T" "1__Contig101935__6468.A1__G"
[4] "1__Contig101935__6468.A2__A" "1__Contig103053__1134.A1__C" "1__Contig103053__1134.A2__T"
loci = sapply(strsplit(PC1snps,"\\."), '[',1)
loci <- unique(loci)
contig <- sapply(strsplit(loci,"__"), '[',2)
pos <- sapply(strsplit(loci,"__"), '[',3)
positions <- cbind(contig,pos)
head(positions)
contig pos
[1,] "Contig101655" "12681"
[2,] "Contig101935" "6468"
[3,] "Contig103053" "1134"
[4,] "Contig103053" "1136"
[5,] "Contig103346" "16201"
[6,] "Contig104078" "683"
fst= read.csv("../analyses/2bRAD/HCSS_sfsm70_PerSiteFst.csv")
head(fst)
contig pos a b fst
1 Contig0 74432 0.000 0.009 0.00000000
2 Contig0 109103 0.034 0.457 0.07423581
3 Contig0 109119 0.003 0.052 0.05660377
4 Contig0 109123 0.002 0.067 0.02941176
5 Contig1 40362 0.000 0.009 0.00000000
6 Contig1 42880 0.003 0.078 0.03797468
# get fst for the top 5% of PC1 loadings, and save
pc1fst <- merge(positions,fst, all.x=T)[,c('contig','pos','fst')]
invalid factor level, NA generated
head(pc1fst)
contig pos fst
1 Contig101655 12681 0.1790393
2 Contig101935 6468 0.3543046
3 Contig103053 1134 0.2765957
4 Contig103053 1136 0.2765957
5 Contig103346 16201 0.1146667
6 Contig104078 683 0.2639138
write.table(pc1fst,"../analyses/2bRAD/Fst_PCA_allHCSS_95.tab",sep="\t",quote = F,row.names = F)
#PCA of just MBD samples
mbd = c("HC1-2B-L5","HC1-4","HC2-15A-L5","HC2-17B","HC3-1","HC3-5-L5B","HC3-7","HC3-10","HC3-11",
"SS2-9-L5","SS2-14-L5","SS2-18-L5","SS3-3","SS3-14","SS3-15","SS3-16","SS3-20","SS5-18")
genmdb <- u.na[i=mbd,drop=TRUE]
genmdb <- genmdb[loc=isPoly(genmdb)]
@tab does not contain integers; as of adegenet_2.0-0, numeric values are no longer used
genmdb
/// GENIND OBJECT /////////
// 18 individuals; 4,373 loci; 8,746 alleles; size: 3.9 Mb
// Basic content
@tab: 18 x 8746 matrix of allele counts
@loc.n.all: number of alleles per locus (range: 2-2)
@loc.fac: locus factor for the 8746 columns of @tab
@all.names: list of allele names for each locus
@ploidy: ploidy of each individual (range: 2-2)
@type: codom
@call: .local(x = x, i = i, j = j, loc = ..1, drop = drop)
// Optional content
@pop: population of each individual (group size range: 9-9)
@strata: a data frame with 2 columns ( POP_ID, INDIVIDUALS )
mbd = c("HC1-2B-L5","HC1-4","HC2-15A-L5","HC2-17B","HC3-1","HC3-5-L5B","HC3-7","HC3-10","HC3-11",
"SS2-9-L5","SS2-14-L5","SS2-18-L5","SS3-3","SS3-14","SS3-15","SS3-16","SS3-20","SS5-18")
HCSS <- HCSSall[i=mbd, drop=TRUE]
HCSS <- HCSS[loc=isPoly(HCSS)]
HCSS
/// GENIND OBJECT /////////
// 18 individuals; 4,342 loci; 8,684 alleles; size: 3.2 Mb
// Basic content
@tab: 18 x 8684 matrix of allele counts
@loc.n.all: number of alleles per locus (range: 2-2)
@loc.fac: locus factor for the 8684 columns of @tab
@all.names: list of allele names for each locus
@ploidy: ploidy of each individual (range: 2-2)
@type: codom
@call: .local(x = x, i = i, j = j, loc = ..1, drop = drop)
// Optional content
@pop: population of each individual (group size range: 9-9)
@strata: a data frame with 2 columns ( POP_ID, INDIVIDUALS )
genoM <- tab(HCSS)
genoM <- genoM[,seq(1,ncol(genoM)-1,2)]
head(genoM)[,c(1:10)]
1__Contig0__109103.A1__C 1__Contig0__109119.A1__T 1__Contig0__109123.A1__T
HC1-2B-L5 1 2 2
HC1-4 1 2 2
HC2-15A-L5 2 2 2
HC2-17B 1 2 2
HC3-1 0 0 0
HC3-5-L5B NA 2 2
1__Contig100040__508.A1__G 1__Contig100040__516.A1__A 1__Contig100040__524.A1__T
HC1-2B-L5 NA NA NA
HC1-4 1 1 2
HC2-15A-L5 1 2 1
HC2-17B 1 2 2
HC3-1 0 2 2
HC3-5-L5B 2 2 0
1__Contig100415__3978.A1__C 1__Contig100504__842.A1__T 1__Contig100504__863.A1__A
HC1-2B-L5 2 2 2
HC1-4 1 2 2
HC2-15A-L5 2 2 2
HC2-17B 1 2 2
HC3-1 2 2 2
HC3-5-L5B 2 2 2
1__Contig100572__10193.A1__G
HC1-2B-L5 NA
HC1-4 NA
HC2-15A-L5 1
HC2-17B 2
HC3-1 1
HC3-5-L5B 1
write.table(genoM,file="../analyses/2bRAD/MBD_HCSS_Afilt32m70_01_pp90.geno",quote = F)
# get geno with maf > 0.05 ( excludes singletons)
x <- HCSS[loc=which(minorAllele(HCSS)> 0.05)]
x
/// GENIND OBJECT /////////
// 18 individuals; 3,547 loci; 7,094 alleles; size: 2.7 Mb
// Basic content
@tab: 18 x 7094 matrix of allele counts
@loc.n.all: number of alleles per locus (range: 2-2)
@loc.fac: locus factor for the 7094 columns of @tab
@all.names: list of allele names for each locus
@ploidy: ploidy of each individual (range: 2-2)
@type: codom
@call: .local(x = x, i = i, j = j, loc = ..1, drop = drop)
// Optional content
@pop: population of each individual (group size range: 9-9)
@strata: a data frame with 2 columns ( POP_ID, INDIVIDUALS )
xM <- tab(x)
xM <- xM[,seq(1,ncol(xM)-1,2)]
write.table(xM,file="../analyses/2bRAD/MBD_HCSS_Afilt32m70_01_pp90_maf05.geno",quote = F)
#pca <- dudi.pca(genmdb,cent=TRUE,scale=TRUE,scannf = T)
pcaM <- dudi.pca(genmdb,cent=TRUE,scale=TRUE,scannf = F, nf = 10)
col6 <- c("red","blue")
par(mfrow=c(2,2))
s.class(pcaM$li, strata(genmdb)$POP_ID,xax=1,yax=2,
sub = "HCSS_Afilt32m70_01_pp90 MBD, PC1-2, 18 inds, 4373 SNPs ",
possub = "topleft",col=transp(col6,.6),axesell=FALSE,
cstar=0, cpoint=3, grid=FALSE, cellipse = 0)
s.class(pcaM$li, strata(genmdb)$POP_ID,xax=1,yax=3,
sub = "HCSS_Afilt32m70_01_pp90 MBD, PC1-3, 18 inds, 4373 SNPs ",
possub = "topleft",col=transp(col6,.6),axesell=FALSE,
cstar=0, cpoint=3, grid=FALSE, cellipse = 0)
s.class(pcaM$li, strata(genmdb)$POP_ID,xax=1,yax=4,
sub = "HCSS_Afilt32m70_01_pp90 MBD, PC1-4, 18 inds, 4373 SNPs ",
possub = "topleft",col=transp(col6,.6),axesell=FALSE,
cstar=0, cpoint=3, grid=FALSE, cellipse = 0)
s.class(pcaM$li, strata(genmdb)$POP_ID,xax=1,yax=5,
sub = "HCSS_Afilt32m70_01_pp90 MBD, PC1-5, 18 inds, 4373 SNPs ",
possub = "topleft",col=transp(col6,.6),axesell=FALSE,
cstar=0, cpoint=3, grid=FALSE, cellipse = 0)
eig.perc <- 100*pcaM$eig/sum(pcaM$eig)
head(eig.perc)
[1] 12.317614 8.988271 7.696656 7.301543 6.422120 6.262430
Saving PCA
write.table(pcaM$li,file = "../analyses/2bRAD/PCA_MBD.tab",sep="\t",quote = F)
save(pcaM, file = "../analyses/2bRAD/PCA_MBD.Robj")
##Get top loading SNPs
library(factoextra)
PCA.var.loadings <- get_pca_var(pcaM)
#data.frame(PCA.var.loadings$contrib) # % that each variable (locus) contributes to PCA dimensions. We're interested in Dim1 and Dim2.
# Subset variable loadings for those that contribute >5% to PC axes
loci <- rownames(subset(data.frame(PCA.var.loadings$contrib), Dim.1>0.05)) #axis 1
loci = sapply(strsplit(loci,"\\."), '[',1)
loci <- unique(loci)
length(loci)
[1] 105
contig <- sapply(strsplit(loci,"__"), '[',2)
pos <- sapply(strsplit(loci,"__"), '[',3)
positions <- cbind(contig,pos)
head(positions)
contig pos
[1,] "Contig107120" "3263"
[2,] "Contig109783" "670"
[3,] "Contig109827" "6152"
[4,] "Contig112828" "139"
[5,] "Contig117256" "646"
[6,] "Contig117622" "6364"
# get fst for the top 5% of PC1 loadings, and save
pc1fstM <- merge(positions,fst, all.x=T)[,c('contig','pos','fst')]
invalid factor level, NA generated
head(pc1fstM)
contig pos fst
1 Contig107120 3263 0.5111441
2 Contig109783 670 0.7590206
3 Contig109827 6152 0.1503132
4 Contig112828 139 0.2288288
5 Contig117256 646 0.1486989
6 Contig117622 6364 0.2860963
write.table(pc1fstM,"../analyses/2bRAD/Fst_PCA_MBD_95.tab",sep="\t",quote = F,row.names = F)
###Old, gets top 5% quantile
li <-pcaM$li
c1 <- pcaM$c1
colnames(c1) <- colnames(li)
#create pcaviz object
pvizM <- pcaviz(x=li,rotation=c1,dat=genmdb$strata)
pcaviz_loadingsplot(pvizM,min.rank = 0.95)
PC1snpsM = rownames(pcaM$c1[which(abs(pcaM$c1$CS1) > quantile(abs(pcaM$c1$CS1),0.95)),])
length(PC1snpsM)
[1] 438
head(PC1snpsM)
[1] "1__Contig101335__1205.A1__C" "1__Contig101335__1205.A2__T" "1__Contig101935__6468.A1__G"
[4] "1__Contig101935__6468.A2__A" "1__Contig102417__2171.A1__G" "1__Contig102417__2171.A2__T"
lociM = sapply(strsplit(PC1snpsM,"\\."), '[',1)
lociM <- unique(lociM)
contig <- sapply(strsplit(lociM,"__"), '[',2)
pos <- sapply(strsplit(lociM,"__"), '[',3)
positionsM <- cbind(contig,pos)
head(positionsM)
contig pos
[1,] "Contig101335" "1205"
[2,] "Contig101935" "6468"
[3,] "Contig102417" "2171"
[4,] "Contig103298" "7435"
[5,] "Contig106125" "3829"
[6,] "Contig107120" "3263"
# get fst for the top 5% of PC1 loadings, and save
pc1fstM <- merge(positionsM,fst, all.x=T)[,c('contig','pos','fst')]
invalid factor level, NA generated
head(pc1fstM)
contig pos fst
1 Contig101335 1205 NA
2 Contig101935 6468 0.35430464
3 Contig102417 2171 0.07792208
4 Contig103298 7435 0.06282723
5 Contig106125 3829 0.27296588
6 Contig107120 3263 0.51114413
write.table(pc1fstM,"../analyses/2bRAD/Fst_PCA_MBD_95.tab",sep="\t",quote = F,row.names = F)
Add a new chunk by clicking the Insert Chunk button on the toolbar or by pressing Ctrl+Alt+I.
When you save the notebook, an HTML file containing the code and output will be saved alongside it (click the Preview button or press Ctrl+Shift+K to preview the HTML file).