PCA of

This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.

Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Ctrl+Shift+Enter.

library(radiator)
library(adegenet)

## Loading required package: ade4

## 
##    /// adegenet 2.1.2 is loaded ////////////
## 
##    > overview: '?adegenet'
##    > tutorials/doc/questions: 'adegenetWeb()' 
##    > bug reports/feature requests: adegenetIssues()

library(PCAviz)

#PCA of all samples uses VCF

##Convert VCF to genind

data <- read_vcf("../data/HCSS_Afilt32m70_01_pp90.vcf",strata = "../../SSHCfiltNreps.strata",parallel.core = 1,vcf.stats = FALSE, filter.monomorphic=FALSE)

## Execution date@time: 20200624@1220

## Folder created: read_vcf_20200624@1220

## Function call and arguments stored in: radiator_read_vcf_args_20200624@1220.tsv

## File written: random.seed (79092)

## 
## Reading VCF

## Data summary:

##     number of samples: 114

##     number of markers: 5269

## done! timing: 1 sec

## 
## File written: radiator_20200624@1220.gds

## 
## Analyzing the data...

## VCF source: unknown

## Data is bi-allelic

## Synchronizing data and strata...

##     Number of strata: 2

##     Number of individuals: 114

## Reads assembly: de novo

## Filters parameters file generated: filters_parameters_20200624@1220.tsv

## Filters parameters file: initiated and updated

## ################################################################################
## ####################### radiator::filter_common_markers ########################
## ################################################################################

## Execution date@time: 20200624@1220

## Function call and arguments stored in: radiator_filter_common_markers_args_20200624@1220.tsv

## Filters parameters file: initiated

## Scanning for common markers...

## Generating UpSet plot to visualize markers in common

## File written: whitelist.common.markers_20200624@1220.tsv

## Filters parameters file: updated

## ################################### RESULTS ####################################

## 
## Filter common markers:

## Number of individuals / strata / chrom / locus / SNP:

##     Before: 114 / 2 / 1 / 3408 / 5269

##     Blacklisted: 0 / 0 / 0 / 0 / 0

##     After: 114 / 2 / 1 / 3408 / 5269

## 
## Computation time, overall: 1 sec

## ####################### completed filter_common_markers ########################

## 
## Preparing output files...

## File written: whitelist.markers.tsv

## Writing the filtered strata: strata.filtered.tsvstrata.filtered.tsv

## ################################### SUMMARY ####################################

## 
## 
## Number of chromosome/contig/scaffold: 1

## Number of locus: 3408

## Number of markers: 5269

## Number of populations: 2

## Number of individuals: 114

## 
## Number of ind/pop:
## HC = 57
## SS = 57

## 
## Number of duplicate id: 0

## radiator Genomic Data Structure (GDS) file: radiator_20200624@1220.gds

## 
## Computation time, overall: 5 sec

## ############################## completed read_vcf ##############################

gen <- genomic_converter(data,strata = "../../SSHCfiltNreps.strata",output = "genind", filter.monomorphic=F)

## ################################################################################
## ######################### radiator::genomic_converter ##########################
## ################################################################################

## Execution date@time: 20200624@1220

## Folder created: 08_radiator_genomic_converter_20200624@1220

## Function call and arguments stored in: radiator_genomic_converter_args_20200624@1220.tsv

## Filters parameters file generated: filters_parameters_20200624@1220.tsv

## 
## Importing data

## Synchronizing data and strata...

##     Number of strata: 2

##     Number of individuals: 114

## 
## Writing tidy data set:
## radiator_data_20200624@1220.rad

## 
## Computation time, overall: 7 sec

## 
## Preparing data for output

## Data is bi-allelic

## Generating adegenet genind object

## ################################### RESULTS ####################################

## Data format of input: SeqVarGDSClass

## Biallelic data

## Number of markers: 5269

## Number of chromosome/contig/scaffold: 1

## Number of strata: 2

## Number of individuals: 114

## 
## Computation time, overall: 10 sec

## ######################### completed genomic_converter ##########################

##Run PCA

gen$genind

## /// GENIND OBJECT /////////
## 
##  // 114 individuals; 5,269 loci; 10,538 alleles; size: 7.8 Mb
## 
##  // Basic content
##    @tab:  114 x 10538 matrix of allele counts
##    @loc.n.all: number of alleles per locus (range: 2-2)
##    @loc.fac: locus factor for the 10538 columns of @tab
##    @all.names: list of allele names for each locus
##    @ploidy: ploidy of each individual  (range: 2-2)
##    @type:  codom
##    @call: radiator::write_genind(data = input, write = TRUE)
## 
##  // Optional content
##    @pop: population of each individual (group size range: 57-57)
##    @strata: a data frame with 2 columns ( POP_ID, INDIVIDUALS )

NA.afDraw<- function(ind){
  ind.mat <- ind@tab
  new.mat <- ind.mat
  af = colSums(ind.mat[,seq(1,ncol(ind.mat)-1,2)],na.rm = TRUE)/
      (2*apply(ind.mat[,seq(1,ncol(ind.mat)-1,2)],2,function(x) sum(!is.na(x))))
  af.Draw <- function(geno, af){
     new <- function(geno,af){
        if(is.na(geno)){
        newA = rbinom(1,2,af)
        }
        else {newA <- geno}
        return(newA)
   }
  new.row <- mapply(geno,af,FUN = new)
  return(new.row)}
  
  new.mat[,seq(1,ncol(ind.mat)-1,2)] <- t(apply(ind.mat[,seq(1,ncol(ind.mat)-1,2)],1,af.Draw,af))
  new.mat[,seq(2,ncol(ind.mat),2)] <- 2-new.mat[,seq(1,ncol(ind.mat)-1,2)]
  new.ind <- ind
  new.ind@tab <- new.mat
  return(new.ind)
}

u.na <- NA.afDraw(gen$genind)

#pca <- dudi.pca(u.na,cent=TRUE,scale=TRUE,scannf = T)

pca <- dudi.pca(u.na,cent=TRUE,scale=TRUE,scannf = F, nf = 10)

col6 <-  c("red","blue")
par(mfrow=c(2,2))
s.class(pca$li, strata(u.na)$POP_ID,xax=1,yax=2, 
        sub = "HCSS_Afilt32m70_01_pp90, PC1-2, 114 inds, 5269 SNPs ",
        possub = "topleft",col=transp(col6,.6),axesell=FALSE,
        cstar=0, cpoint=3, grid=FALSE, cellipse = 0)
s.class(pca$li, strata(u.na)$POP_ID,xax=1,yax=3, 
        sub = "HCSS_Afilt32m70_01_pp90, PC1-3, 114 inds, 5269 SNPs ",
        possub = "topleft",col=transp(col6,.6),axesell=FALSE,
        cstar=0, cpoint=3, grid=FALSE, cellipse = 0)
s.class(pca$li, strata(u.na)$POP_ID,xax=1,yax=4, 
        sub = "HCSS_Afilt32m70_01_pp90, PC1-4, 114 inds, 5269 SNPs ",
        possub = "topleft",col=transp(col6,.6),axesell=FALSE,
        cstar=0, cpoint=3, grid=FALSE, cellipse = 0)
s.class(pca$li, strata(u.na)$POP_ID,xax=1,yax=5, 
        sub = "HCSS_Afilt32m70_01_pp90, PC1-5, 114 inds, 5269 SNPs ",
        possub = "topleft",col=transp(col6,.6),axesell=FALSE,
        cstar=0, cpoint=3, grid=FALSE, cellipse = 0)

eig.perc <- 100*pca$eig/sum(pca$eig)
head(eig.perc)

## [1] 6.644372 5.073012 4.562529 4.208986 3.060878 2.963567

Saving PCA

write.table(pca$li,file = "../analyses/2bRAD/PCA_allHCSS.tab",sep="\t",quote = F)
save(pca,  file = "../analyses/2bRAD/PCA_allHCSS.Robj")

##Get top loading SNPs

library(factoextra)

## Loading required package: ggplot2

## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

PCA.var.loadings <- get_pca_var(pca)

#data.frame(PCA.var.loadings$contrib) # % that each variable (locus) contributes to PCA dimensions. We're interested in Dim1 and Dim2. 
# Subset variable loadings for those that contribute >5% to PC axes 
loci <- rownames(subset(data.frame(PCA.var.loadings$contrib), Dim.1>0.05)) #axis 1 
loci = sapply(strsplit(loci,"\\."), '[',1)
loci <- unique(loci)
length(loci)

## [1] 107

contig <- sapply(strsplit(loci,"__"), '[',2)
pos <- sapply(strsplit(loci,"__"), '[',3)

positions <- cbind(contig,pos)
head(positions)

##      contig         pos    
## [1,] "Contig101655" "12681"
## [2,] "Contig104078" "683"  
## [3,] "Contig107120" "3263" 
## [4,] "Contig109783" "670"  
## [5,] "Contig121049" "2908" 
## [6,] "Contig122227" "2742"

# read in fst per site
fst= read.csv("../analyses/2bRAD/HCSS_sfsm70_PerSiteFst.csv")
head(fst)

##    contig    pos     a     b        fst
## 1 Contig0  74432 0.000 0.009 0.00000000
## 2 Contig0 109103 0.034 0.457 0.07423581
## 3 Contig0 109119 0.003 0.052 0.05660377
## 4 Contig0 109123 0.002 0.067 0.02941176
## 5 Contig1  40362 0.000 0.009 0.00000000
## 6 Contig1  42880 0.003 0.078 0.03797468

# get fst for the top 5% of PC1 loadings, and save
pc1fst <- merge(positions,fst, all.x=T)[,c('contig','pos','fst')]

## Warning in `[<-.factor`(`*tmp*`, ri, value = c(74432L, 109103L, 109119L, :
## invalid factor level, NA generated

head(pc1fst)

##         contig   pos       fst
## 1 Contig101655 12681 0.1790393
## 2 Contig104078   683 0.2639138
## 3 Contig107120  3263 0.5111441
## 4 Contig109783   670 0.7590206
## 5 Contig121049  2908 0.3650255
## 6 Contig122227  2742 0.1706349

write.table(pc1fst,"../analyses/2bRAD/Fst_PCA_allHCSS_95.tab",sep="\t",quote = F,row.names = F)

##old way

li <-pca$li
c1 <- pca$c1
colnames(c1) <- colnames(li)
#create pcaviz object
pviz <- pcaviz(x=li,rotation=c1,dat=u.na$strata)

pcaviz_loadingsplot(pviz,min.rank = 0.95)

PC1snps = rownames(pca$c1[which(abs(pca$c1$CS1) > quantile(abs(pca$c1$CS1),0.95)),])
length(PC1snps)

## [1] 527

head(PC1snps)

## [1] "1__Contig101655__12681.A1__A" "1__Contig101655__12681.A2__T"
## [3] "1__Contig101935__6468.A1__G"  "1__Contig101935__6468.A2__A" 
## [5] "1__Contig103053__1134.A1__C"  "1__Contig103053__1134.A2__T"

loci = sapply(strsplit(PC1snps,"\\."), '[',1)
loci <- unique(loci)
contig <- sapply(strsplit(loci,"__"), '[',2)
pos <- sapply(strsplit(loci,"__"), '[',3)

positions <- cbind(contig,pos)
head(positions)

##      contig         pos    
## [1,] "Contig101655" "12681"
## [2,] "Contig101935" "6468" 
## [3,] "Contig103053" "1134" 
## [4,] "Contig103053" "1136" 
## [5,] "Contig103346" "16201"
## [6,] "Contig104078" "683"

# read in fst per site
fst= read.csv("../analyses/2bRAD/HCSS_sfsm70_PerSiteFst.csv")
head(fst)

##    contig    pos     a     b        fst
## 1 Contig0  74432 0.000 0.009 0.00000000
## 2 Contig0 109103 0.034 0.457 0.07423581
## 3 Contig0 109119 0.003 0.052 0.05660377
## 4 Contig0 109123 0.002 0.067 0.02941176
## 5 Contig1  40362 0.000 0.009 0.00000000
## 6 Contig1  42880 0.003 0.078 0.03797468

# get fst for the top 5% of PC1 loadings, and save
pc1fst <- merge(positions,fst, all.x=T)[,c('contig','pos','fst')]

## Warning in `[<-.factor`(`*tmp*`, ri, value = c(74432L, 109103L, 109119L, :
## invalid factor level, NA generated

head(pc1fst)

##         contig   pos       fst
## 1 Contig101655 12681 0.1790393
## 2 Contig101935  6468 0.3543046
## 3 Contig103053  1134 0.2765957
## 4 Contig103053  1136 0.2765957
## 5 Contig103346 16201 0.1146667
## 6 Contig104078   683 0.2639138

write.table(pc1fst,"../analyses/2bRAD/Fst_PCA_allHCSS_95.tab",sep="\t",quote = F,row.names = F)

#PCA of just MBD samples

mbd = c("HC1-2B-L5","HC1-4","HC2-15A-L5","HC2-17B","HC3-1","HC3-5-L5B","HC3-7","HC3-10","HC3-11",
        "SS2-9-L5","SS2-14-L5","SS2-18-L5","SS3-3","SS3-14","SS3-15","SS3-16","SS3-20","SS5-18")
genmdb <- u.na[i=mbd,drop=TRUE]
genmdb <- genmdb[loc=isPoly(genmdb)]

## Warning in validityMethod(object): @tab does not contain integers; as of
## adegenet_2.0-0, numeric values are no longer used

genmdb

## /// GENIND OBJECT /////////
## 
##  // 18 individuals; 4,382 loci; 8,764 alleles; size: 3.9 Mb
## 
##  // Basic content
##    @tab:  18 x 8764 matrix of allele counts
##    @loc.n.all: number of alleles per locus (range: 2-2)
##    @loc.fac: locus factor for the 8764 columns of @tab
##    @all.names: list of allele names for each locus
##    @ploidy: ploidy of each individual  (range: 2-2)
##    @type:  codom
##    @call: .local(x = x, i = i, j = j, loc = ..1, drop = drop)
## 
##  // Optional content
##    @pop: population of each individual (group size range: 9-9)
##    @strata: a data frame with 2 columns ( POP_ID, INDIVIDUALS )

#pca <- dudi.pca(genmdb,cent=TRUE,scale=TRUE,scannf = T)

pcaM <- dudi.pca(genmdb,cent=TRUE,scale=TRUE,scannf = F, nf = 10)

col6 <-  c("red","blue")
par(mfrow=c(2,2))
s.class(pcaM$li, strata(genmdb)$POP_ID,xax=1,yax=2, 
        sub = "HCSS_Afilt32m70_01_pp90 MBD, PC1-2, 18 inds, 4373 SNPs ",
        possub = "topleft",col=transp(col6,.6),axesell=FALSE,
        cstar=0, cpoint=3, grid=FALSE, cellipse = 0)
s.class(pcaM$li, strata(genmdb)$POP_ID,xax=1,yax=3, 
        sub = "HCSS_Afilt32m70_01_pp90 MBD, PC1-3, 18 inds, 4373 SNPs ",
        possub = "topleft",col=transp(col6,.6),axesell=FALSE,
        cstar=0, cpoint=3, grid=FALSE, cellipse = 0)
s.class(pcaM$li, strata(genmdb)$POP_ID,xax=1,yax=4, 
        sub = "HCSS_Afilt32m70_01_pp90 MBD, PC1-4, 18 inds, 4373 SNPs ",
        possub = "topleft",col=transp(col6,.6),axesell=FALSE,
        cstar=0, cpoint=3, grid=FALSE, cellipse = 0)
s.class(pcaM$li, strata(genmdb)$POP_ID,xax=1,yax=5, 
        sub = "HCSS_Afilt32m70_01_pp90 MBD, PC1-5, 18 inds, 4373 SNPs ",
        possub = "topleft",col=transp(col6,.6),axesell=FALSE,
        cstar=0, cpoint=3, grid=FALSE, cellipse = 0)

eig.perc <- 100*pcaM$eig/sum(pcaM$eig)
head(eig.perc)

## [1] 12.365344  9.051154  7.673460  7.288396  6.416726  6.216463

Saving PCA

write.table(pcaM$li,file = "../analyses/2bRAD/PCA_MBD.tab",sep="\t",quote = F)
save(pcaM,  file = "../analyses/2bRAD/PCA_MBD.Robj")

##Get top loading SNPs

library(factoextra)
PCA.var.loadings <- get_pca_var(pcaM)

#data.frame(PCA.var.loadings$contrib) # % that each variable (locus) contributes to PCA dimensions. We're interested in Dim1 and Dim2. 
# Subset variable loadings for those that contribute >5% to PC axes 
loci <- rownames(subset(data.frame(PCA.var.loadings$contrib), Dim.1>0.05)) #axis 1 
loci = sapply(strsplit(loci,"\\."), '[',1)
loci <- unique(loci)
length(loci)

## [1] 110

contig <- sapply(strsplit(loci,"__"), '[',2)
pos <- sapply(strsplit(loci,"__"), '[',3)

positions <- cbind(contig,pos)
head(positions)

##      contig         pos   
## [1,] "Contig101935" "6468"
## [2,] "Contig107120" "3263"
## [3,] "Contig109827" "6152"
## [4,] "Contig112828" "139" 
## [5,] "Contig117256" "646" 
## [6,] "Contig117622" "6364"

# get fst for the top 5% of PC1 loadings, and save
pc1fstM <- merge(positions,fst, all.x=T)[,c('contig','pos','fst')]

## Warning in `[<-.factor`(`*tmp*`, ri, value = c(74432L, 109103L, 109119L, :
## invalid factor level, NA generated

head(pc1fstM)

##         contig  pos       fst
## 1 Contig101935 6468 0.3543046
## 2 Contig107120 3263 0.5111441
## 3 Contig109827 6152 0.1503132
## 4 Contig112828  139 0.2288288
## 5 Contig117256  646 0.1486989
## 6 Contig117622 6364 0.2860963

write.table(pc1fstM,"../analyses/2bRAD/Fst_PCA_MBD_95.tab",sep="\t",quote = F,row.names = F)

###Old, gets top 5% quantile

li <-pcaM$li
c1 <- pcaM$c1
colnames(c1) <- colnames(li)
#create pcaviz object
pvizM <- pcaviz(x=li,rotation=c1,dat=genmdb$strata)

pcaviz_loadingsplot(pvizM,min.rank = 0.95)

PC1snpsM = rownames(pcaM$c1[which(abs(pcaM$c1$CS1) > quantile(abs(pcaM$c1$CS1),0.95)),])
length(PC1snpsM)

## [1] 438

head(PC1snpsM)

## [1] "1__Contig101335__1205.A1__C" "1__Contig101335__1205.A2__T"
## [3] "1__Contig101935__6468.A1__G" "1__Contig101935__6468.A2__A"
## [5] "1__Contig103298__7435.A1__G" "1__Contig103298__7435.A2__C"

lociM = sapply(strsplit(PC1snpsM,"\\."), '[',1)
lociM <- unique(lociM)
contig <- sapply(strsplit(lociM,"__"), '[',2)
pos <- sapply(strsplit(lociM,"__"), '[',3)

positionsM <- cbind(contig,pos)
head(positionsM)

##      contig         pos   
## [1,] "Contig101335" "1205"
## [2,] "Contig101935" "6468"
## [3,] "Contig103298" "7435"
## [4,] "Contig106125" "3829"
## [5,] "Contig107120" "3263"
## [6,] "Contig108706" "5626"

# get fst for the top 5% of PC1 loadings, and save
pc1fstM <- merge(positionsM,fst, all.x=T)[,c('contig','pos','fst')]

## Warning in `[<-.factor`(`*tmp*`, ri, value = c(74432L, 109103L, 109119L, :
## invalid factor level, NA generated

head(pc1fstM)

##         contig  pos        fst
## 1 Contig101335 1205         NA
## 2 Contig101935 6468 0.35430464
## 3 Contig103298 7435 0.06282723
## 4 Contig106125 3829 0.27296588
## 5 Contig107120 3263 0.51114413
## 6 Contig108706 5626 0.12138728

write.table(pc1fstM,"../analyses/2bRAD/Fst_PCA_MBD_95.tab",sep="\t",quote = F,row.names = F)

Add a new chunk by clicking the Insert Chunk button on the toolbar or by pressing Ctrl+Alt+I.

When you save the notebook, an HTML file containing the code and output will be saved alongside it (click the Preview button or press Ctrl+Shift+K to preview the HTML file).