--- title: 'PCA of 2b RAD data' output: html_document: default html_notebook: default --- This is an [R Markdown](http://rmarkdown.rstudio.com) Notebook. When you execute code within the notebook, the results appear beneath the code. Try executing this chunk by clicking the *Run* button within the chunk or by placing your cursor inside it and pressing *Ctrl+Shift+Enter*. ```{r} library(radiator) library(adegenet) library(PCAviz) ``` #PCA of all samples uses VCF ##Convert VCF to genind ```{r} data <- read_vcf("../data/HCSS_Afilt32m70_01_pp90.vcf",strata = "../../SSHCfiltNreps.strata",parallel.core = 1,vcf.stats = FALSE, filter.monomorphic=FALSE) ``` ```{r} gen <- genomic_converter(data,strata = "../../SSHCfiltNreps.strata",output = "genind", filter.monomorphic=F) ``` ##Run PCA ```{r} HCSSall <- gen$genind HCSSall save(HCSSall,file="../analyses/2bRAD/HCSS_Afilt32m70_01_pp90.genind") ``` ```{r} geno <- tab(HCSSall) geno <- geno[,seq(1,ncol(geno)-1,2)] head(geno)[,c(1:10)] write.table(geno,file="../analyses/2bRAD/HCSS_Afilt32m70_01_pp90.geno",quote = F) ``` ```{r} NA.afDraw<- function(ind){ ind.mat <- ind@tab new.mat <- ind.mat af = colSums(ind.mat[,seq(1,ncol(ind.mat)-1,2)],na.rm = TRUE)/ (2*apply(ind.mat[,seq(1,ncol(ind.mat)-1,2)],2,function(x) sum(!is.na(x)))) af.Draw <- function(geno, af){ new <- function(geno,af){ if(is.na(geno)){ newA = rbinom(1,2,af) } else {newA <- geno} return(newA) } new.row <- mapply(geno,af,FUN = new) return(new.row)} new.mat[,seq(1,ncol(ind.mat)-1,2)] <- t(apply(ind.mat[,seq(1,ncol(ind.mat)-1,2)],1,af.Draw,af)) new.mat[,seq(2,ncol(ind.mat),2)] <- 2-new.mat[,seq(1,ncol(ind.mat)-1,2)] new.ind <- ind new.ind@tab <- new.mat return(new.ind) } ``` ```{r} u.na <- NA.afDraw(gen$genind) ``` ```{r} #pca <- dudi.pca(u.na,cent=TRUE,scale=TRUE,scannf = T) ``` ```{r} pca <- dudi.pca(u.na,cent=TRUE,scale=TRUE,scannf = F, nf = 10) ``` ```{r} col6 <- c("red","blue") par(mfrow=c(2,2)) s.class(pca$li, strata(u.na)$POP_ID,xax=1,yax=2, sub = "HCSS_Afilt32m70_01_pp90, PC1-2, 114 inds, 5269 SNPs ", possub = "topleft",col=transp(col6,.6),axesell=FALSE, cstar=0, cpoint=3, grid=FALSE, cellipse = 0) s.class(pca$li, strata(u.na)$POP_ID,xax=1,yax=3, sub = "HCSS_Afilt32m70_01_pp90, PC1-3, 114 inds, 5269 SNPs ", possub = "topleft",col=transp(col6,.6),axesell=FALSE, cstar=0, cpoint=3, grid=FALSE, cellipse = 0) s.class(pca$li, strata(u.na)$POP_ID,xax=1,yax=4, sub = "HCSS_Afilt32m70_01_pp90, PC1-4, 114 inds, 5269 SNPs ", possub = "topleft",col=transp(col6,.6),axesell=FALSE, cstar=0, cpoint=3, grid=FALSE, cellipse = 0) s.class(pca$li, strata(u.na)$POP_ID,xax=1,yax=5, sub = "HCSS_Afilt32m70_01_pp90, PC1-5, 114 inds, 5269 SNPs ", possub = "topleft",col=transp(col6,.6),axesell=FALSE, cstar=0, cpoint=3, grid=FALSE, cellipse = 0) ``` ```{r} eig.perc <- 100*pca$eig/sum(pca$eig) head(eig.perc) ``` Saving PCA ```{r} write.table(pca$li,file = "../analyses/2bRAD/PCA_allHCSS.tab",sep="\t",quote = F) save(pca, file = "../analyses/2bRAD/PCA_allHCSS.Robj") ``` ##Get top loading SNPs ```{r} library(factoextra) PCA.var.loadings <- get_pca_var(pca) #data.frame(PCA.var.loadings$contrib) # % that each variable (locus) contributes to PCA dimensions. We're interested in Dim1 and Dim2. # Subset variable loadings for those that contribute >5% to PC axes loci <- rownames(subset(data.frame(PCA.var.loadings$contrib), Dim.1>0.05)) #axis 1 loci = sapply(strsplit(loci,"\\."), '[',1) loci <- unique(loci) length(loci) contig <- sapply(strsplit(loci,"__"), '[',2) pos <- sapply(strsplit(loci,"__"), '[',3) positions <- cbind(contig,pos) head(positions) ``` ```{r} # read in fst per site fst= read.csv("../analyses/2bRAD/HCSS_sfsm70_PerSiteFst.csv") head(fst) ``` ```{r} # get fst for the top 5% of PC1 loadings, and save pc1fst <- merge(positions,fst, all.x=T)[,c('contig','pos','fst')] head(pc1fst) ``` ```{r} write.table(pc1fst,"../analyses/2bRAD/Fst_PCA_allHCSS_95.tab",sep="\t",quote = F,row.names = F) ``` ##old way ```{r} li <-pca$li c1 <- pca$c1 colnames(c1) <- colnames(li) #create pcaviz object pviz <- pcaviz(x=li,rotation=c1,dat=u.na$strata) ``` ```{r} pcaviz_loadingsplot(pviz,min.rank = 0.95) ``` ```{r} PC1snps = rownames(pca$c1[which(abs(pca$c1$CS1) > quantile(abs(pca$c1$CS1),0.95)),]) length(PC1snps) head(PC1snps) ``` ```{r} loci = sapply(strsplit(PC1snps,"\\."), '[',1) loci <- unique(loci) contig <- sapply(strsplit(loci,"__"), '[',2) pos <- sapply(strsplit(loci,"__"), '[',3) positions <- cbind(contig,pos) head(positions) ``` ```{r} # read in fst per site fst= read.csv("../analyses/2bRAD/HCSS_sfsm70_PerSiteFst.csv") head(fst) ``` ```{r} # get fst for the top 5% of PC1 loadings, and save pc1fst <- merge(positions,fst, all.x=T)[,c('contig','pos','fst')] head(pc1fst) ``` ```{r} write.table(pc1fst,"../analyses/2bRAD/Fst_PCA_allHCSS_95.tab",sep="\t",quote = F,row.names = F) ``` #PCA of just MBD samples ```{r} mbd = c("HC1-2B-L5","HC1-4","HC2-15A-L5","HC2-17B","HC3-1","HC3-5-L5B","HC3-7","HC3-10","HC3-11", "SS2-9-L5","SS2-14-L5","SS2-18-L5","SS3-3","SS3-14","SS3-15","SS3-16","SS3-20","SS5-18") genmdb <- u.na[i=mbd,drop=TRUE] genmdb <- genmdb[loc=isPoly(genmdb)] genmdb ``` ```{r} mbd = c("HC1-2B-L5","HC1-4","HC2-15A-L5","HC2-17B","HC3-1","HC3-5-L5B","HC3-7","HC3-10","HC3-11", "SS2-9-L5","SS2-14-L5","SS2-18-L5","SS3-3","SS3-14","SS3-15","SS3-16","SS3-20","SS5-18") HCSS <- HCSSall[i=mbd, drop=TRUE] HCSS <- HCSS[loc=isPoly(HCSS)] HCSS genoM <- tab(HCSS) genoM <- genoM[,seq(1,ncol(genoM)-1,2)] head(genoM)[,c(1:10)] write.table(genoM,file="../analyses/2bRAD/MBD_HCSS_Afilt32m70_01_pp90.geno",quote = F) ``` ```{r} # get geno with maf > 0.05 ( excludes singletons) x <- HCSS[loc=which(minorAllele(HCSS)> 0.05)] x xM <- tab(x) xM <- xM[,seq(1,ncol(xM)-1,2)] write.table(xM,file="../analyses/2bRAD/MBD_HCSS_Afilt32m70_01_pp90_maf05.geno",quote = F) ``` ```{r} #pca <- dudi.pca(genmdb,cent=TRUE,scale=TRUE,scannf = T) ``` ```{r} pcaM <- dudi.pca(genmdb,cent=TRUE,scale=TRUE,scannf = F, nf = 10) ``` ```{r} col6 <- c("red","blue") par(mfrow=c(2,2)) s.class(pcaM$li, strata(genmdb)$POP_ID,xax=1,yax=2, sub = "HCSS_Afilt32m70_01_pp90 MBD, PC1-2, 18 inds, 4373 SNPs ", possub = "topleft",col=transp(col6,.6),axesell=FALSE, cstar=0, cpoint=3, grid=FALSE, cellipse = 0) s.class(pcaM$li, strata(genmdb)$POP_ID,xax=1,yax=3, sub = "HCSS_Afilt32m70_01_pp90 MBD, PC1-3, 18 inds, 4373 SNPs ", possub = "topleft",col=transp(col6,.6),axesell=FALSE, cstar=0, cpoint=3, grid=FALSE, cellipse = 0) s.class(pcaM$li, strata(genmdb)$POP_ID,xax=1,yax=4, sub = "HCSS_Afilt32m70_01_pp90 MBD, PC1-4, 18 inds, 4373 SNPs ", possub = "topleft",col=transp(col6,.6),axesell=FALSE, cstar=0, cpoint=3, grid=FALSE, cellipse = 0) s.class(pcaM$li, strata(genmdb)$POP_ID,xax=1,yax=5, sub = "HCSS_Afilt32m70_01_pp90 MBD, PC1-5, 18 inds, 4373 SNPs ", possub = "topleft",col=transp(col6,.6),axesell=FALSE, cstar=0, cpoint=3, grid=FALSE, cellipse = 0) ``` ```{r} eig.perc <- 100*pcaM$eig/sum(pcaM$eig) head(eig.perc) ``` Saving PCA ```{r} write.table(pcaM$li,file = "../analyses/2bRAD/PCA_MBD.tab",sep="\t",quote = F) save(pcaM, file = "../analyses/2bRAD/PCA_MBD.Robj") ``` ##Get top loading SNPs ```{r} library(factoextra) PCA.var.loadings <- get_pca_var(pcaM) #data.frame(PCA.var.loadings$contrib) # % that each variable (locus) contributes to PCA dimensions. We're interested in Dim1 and Dim2. # Subset variable loadings for those that contribute >5% to PC axes loci <- rownames(subset(data.frame(PCA.var.loadings$contrib), Dim.1>0.05)) #axis 1 loci = sapply(strsplit(loci,"\\."), '[',1) loci <- unique(loci) length(loci) contig <- sapply(strsplit(loci,"__"), '[',2) pos <- sapply(strsplit(loci,"__"), '[',3) positions <- cbind(contig,pos) head(positions) ``` ```{r} # get fst for the top 5% of PC1 loadings, and save pc1fstM <- merge(positions,fst, all.x=T)[,c('contig','pos','fst')] head(pc1fstM) ``` ```{r} write.table(pc1fstM,"../analyses/2bRAD/Fst_PCA_MBD_95.tab",sep="\t",quote = F,row.names = F) ``` ###Old, gets top 5% quantile ```{r} li <-pcaM$li c1 <- pcaM$c1 colnames(c1) <- colnames(li) #create pcaviz object pvizM <- pcaviz(x=li,rotation=c1,dat=genmdb$strata) ``` ```{r} pcaviz_loadingsplot(pvizM,min.rank = 0.95) ``` ```{r} PC1snpsM = rownames(pcaM$c1[which(abs(pcaM$c1$CS1) > quantile(abs(pcaM$c1$CS1),0.95)),]) length(PC1snpsM) head(PC1snpsM) ``` ```{r} lociM = sapply(strsplit(PC1snpsM,"\\."), '[',1) lociM <- unique(lociM) contig <- sapply(strsplit(lociM,"__"), '[',2) pos <- sapply(strsplit(lociM,"__"), '[',3) positionsM <- cbind(contig,pos) head(positionsM) ``` ```{r} # get fst for the top 5% of PC1 loadings, and save pc1fstM <- merge(positionsM,fst, all.x=T)[,c('contig','pos','fst')] head(pc1fstM) ``` ```{r} write.table(pc1fstM,"../analyses/2bRAD/Fst_PCA_MBD_95.tab",sep="\t",quote = F,row.names = F) ``` Add a new chunk by clicking the *Insert Chunk* button on the toolbar or by pressing *Ctrl+Alt+I*. When you save the notebook, an HTML file containing the code and output will be saved alongside it (click the *Preview* button or press *Ctrl+Shift+K* to preview the HTML file).