1 About this report
2 Overview on the data
3 PCA on the samples
4 PCA on the genes
5 Shortlisted genes
6 Functional interpretation of the principal components
7 Multifactor exploration of the dataset
8 About pcaExplorer
- 8.1 Developers
- 8.2 Code
9 Citation info
10 Session Information

1 About this report

This content has been loaded from the template report .Rmd file. Please edit it at your best convenience!

If you are viewing this report in the Preview, you might require the installation of the PhantomJS to render correctly some HTML widgets. This can be done by using the webshot package and calling webshot::install_phantomjs(). Alternatively, the more recent webshot2 package uses the headless Chrome browser (via the chromote package, requiring Google Chrome or other Chromium-based browser).

2 Overview on the data

The data provided was used to construct the following objects

values$mydds

## class: DESeqDataSet 
## dim: 26581 8 
## metadata(1): version
## assays(1): counts
## rownames(26581): g8757.t1 g17014.t1 ... g15464.t1 g13060.t2
## rowData names(0):
## colnames(8): PSC.0149 PSC.0150 ... PSC.0228 PSC.0231
## colData names(8): coelom_ID star_ID ... sample_date sizeFactor

values$mydst

## class: DESeqTransform 
## dim: 26581 8 
## metadata(1): version
## assays(1): ''
## rownames(26581): g8757.t1 g17014.t1 ... g15464.t1 g13060.t2
## rowData names(4): baseMean baseVar allZero dispFit
## colnames(8): PSC.0149 PSC.0150 ... PSC.0228 PSC.0231
## colData names(8): coelom_ID star_ID ... sample_date sizeFactor

values$transformation_type

## [1] "vst"

head(values$myannotation)

## NULL

The following design were used:

DT::datatable(as.data.frame(colData(values$mydds)))

An overview of the table for the features is shown here, by displaying the raw_counts

if(input$countstable_unit=="raw_counts")
  currentMat <- counts(values$mydds,normalized=FALSE)
if(input$countstable_unit=="normalized_counts")
  currentMat <- counts(values$mydds,normalized=TRUE)
if(input$countstable_unit=="rlog_counts")
  currentMat <- assay(values$mydst)
if(input$countstable_unit=="log10_counts")
  currentMat <- log10(1 + counts(values$mydds,normalized=TRUE))

DT::datatable(currentMat)

This is how the samples cluster if we use euclidean distance on the rlog transformed values

if (!is.null(input$color_by)){
  expgroups <- as.data.frame(colData(values$mydst)[,input$color_by])
  # expgroups <- interaction(expgroups)
  rownames(expgroups) <- colnames(values$mydst)
  colnames(expgroups) <- input$color_by

  pheatmap(as.matrix(dist(t(assay(values$mydst)))),annotation_col = expgroups)
} else {
  pheatmap(as.matrix(dist(t(assay(values$mydst)))))
}

This is an overview of the number of available reads in each sample (normally these are only uniquely aligned reads)

rr <- colSums(counts(values$mydds))/1e6
if(is.null(names(rr)))
  names(rr) <- paste0("sample_",1:length(rr))
rrdf <- data.frame(Reads=rr,Sample=names(rr),stringsAsFactors = FALSE)
if (!is.null(input$color_by)) {
  selGroups <- as.data.frame(colData(values$mydds)[input$color_by])
  rrdf$Group <- interaction(selGroups)
  p <- ggplot(rrdf,aes_string("Sample",weight="Reads")) + geom_bar(aes_string(fill="Group")) + theme_bw()
  p
} else {
  p <- ggplot(rrdf,aes_string("Sample",weight="Reads")) + geom_bar() + theme_bw()
  p
}

print(colSums(counts(values$mydds)))

## PSC.0149 PSC.0150 PSC.0174 PSC.0187 PSC.0188 PSC.0190 PSC.0228 PSC.0231 
##  5466522  1531748  5344256  5206071  6084335  5905396  6824405  8345219

summary(colSums(counts(values$mydds))/1e6)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.532   5.310   5.686   5.588   6.269   8.345

This is a quick info on the number of detected genes

t1 <- rowSums(counts(values$mydds))
t2 <- rowMeans(counts(values$mydds,normalized=TRUE))

thresh_rowsums <- input$threshold_rowsums
thresh_rowmeans <- input$threshold_rowmeans
abs_t1 <- sum(t1 > thresh_rowsums)
rel_t1 <- 100 * mean(t1 > thresh_rowsums)
abs_t2 <- sum(t2 > thresh_rowmeans)
rel_t2 <- 100 * mean(t2 > thresh_rowmeans)

cat("Number of detected genes:\n")

## Number of detected genes:

# TODO: parametrize the thresholds
cat(abs_t1,"genes have at least a sample with more than",thresh_rowsums,"counts\n")

## 25356 genes have at least a sample with more than 0 counts

cat(paste0(round(rel_t1,3),"%"), "of the",nrow(values$mydds),"genes have at least a sample with more than",thresh_rowsums,"counts\n")

## 95.391% of the 26581 genes have at least a sample with more than 0 counts

cat(abs_t2,"genes have more than",thresh_rowmeans,"counts (normalized) on average\n")

## 25356 genes have more than 0 counts (normalized) on average

cat(paste0(round(rel_t2,3),"%"), "of the",nrow(values$mydds),"genes have more than",thresh_rowsums,"counts (normalized) on average\n")

## 95.391% of the 26581 genes have more than 0 counts (normalized) on average

cat("Counts are ranging from", min(counts(values$mydds)),"to",max(counts(values$mydds)))

## Counts are ranging from 0 to 899001

3 PCA on the samples

This plot shows how the samples are related to each other by plotting PC 1 vs PC 2, using the top 300 most variant genes

res <- pcaplot(values$mydst,intgroup = input$color_by,ntop = input$pca_nrgenes,
                     pcX = as.integer(input$pc_x),pcY = as.integer(input$pc_y),
                     text_labels = input$sample_labels,
                     point_size = input$pca_point_size, title="Samples PCA - zoom in",
                     ellipse = input$pca_ellipse, ellipse.prob = input$pca_cislider
      )
res <- res + theme_bw()
res

The scree plot helps determining the number of underlying principal components

rv <- rowVars(assay(values$mydst))
select <- order(rv, decreasing = TRUE)[seq_len(min(input$pca_nrgenes,length(rv)))]
pca <- prcomp(t(assay(values$mydst)[select, ]))

res <- pcascree(pca,type = input$scree_type, pc_nr = input$scree_pcnr, title="Scree plot for the samples PCA")
res <- res + theme_bw()
res

The genes with the highest loadings in the selected principal components are the following

rv <- rowVars(assay(values$mydst))
select <- order(rv, decreasing = TRUE)[seq_len(min(input$pca_nrgenes,length(rv)))]
pca <- prcomp(t(assay(values$mydst)[select, ]))

par(mfrow=c(2,1))
hi_loadings(pca,whichpc = as.integer(input$pc_x),topN = input$ntophiload,annotation = values$myannotation)
hi_loadings(pca,whichpc = as.integer(input$pc_y),topN = input$ntophiload,annotation = values$myannotation)

4 PCA on the genes

This plot illustrates how the top 300 variant genes are distributed in PC 1 vs PC 2

if(!is.null(input$color_by)) {
        expgroups <- as.data.frame(colData(values$mydst)[,input$color_by])
        expgroups <- interaction(expgroups)
        expgroups <- factor(expgroups,levels=unique(expgroups))

      } else {
        expgroups <- colnames(values$mydst)
      }
      colGroups <- colSel()[factor(expgroups)]

res <- genespca(values$mydst,
                ntop = input$pca_nrgenes,
                choices = c(as.integer(input$pc_x),as.integer(input$pc_y)),
                biplot = TRUE,
                arrowColors = factor(colGroups,levels=unique(colGroups)),
                groupNames = expgroups,
                alpha=input$pca_point_alpha,coordEqual=FALSE,useRownamesAsLabels=FALSE,labels.size=input$pca_label_size,
                point_size=input$pca_point_size,varname.size=input$pca_varname_size, scaleArrow = input$pca_scale_arrow,
                annotation=values$myannotation)
res

For the selected genes, this is the overall profile across all samples

if(!is.null(input$pcagenes_brush) & length(input$color_by)>0)
      geneprofiler(values$mydst,
                   genelist = curData_brush()$ids,
                   intgroup = input$color_by,
                   plotZ = input$zprofile)

And here is an interactive heatmap for that subset

if(!is.null(input$pcagenes_brush))
{
  brushedObject <- curData_brush()
  if(nrow(brushedObject) > 1){
    selectedGenes <- brushedObject$ids
    toplot <- assay(values$mydst)[selectedGenes,]
    rownames(toplot) <- values$myannotation$gene_name[match(rownames(toplot),rownames(values$myannotation))]
  
    mycolss <- c("#313695","#4575b4","#74add1","#abd9e9","#e0f3f8","#fee090","#fdae61","#f46d43","#d73027","#a50026") # to be consistent with red/blue usual coding
  
    heatmaply(toplot,Colv = as.logical(input$heatmap_colv),colors = mycolss)
  }
}

5 Shortlisted genes

This gene was selected in the interactive session.

anno_id <- rownames(values$mydst)
      anno_gene <- values$myannotation$gene_name

      # if(is.null(input$color_by) & input$genefinder!="")
      #   return(ggplot() + annotate("text",label="Select a factor to plot your gene",0,0) + theme_bw())
      # if(is.null(input$color_by) & input$genefinder=="")
      #   return(ggplot() + annotate("text",label="Select a gene and a factor to plot gene",0,0) + theme_bw())
      # if(input$genefinder=="")
      #   return(ggplot() + annotate("text",label="Type in a gene name/id",0,0) + theme_bw())
      # if(!input$genefinder %in% anno_gene & !input$genefinder %in% anno_id)
      #   return(ggplot() + annotate("text",label="Gene not found...",0,0) + theme_bw())
if(input$genefinder!="") {

  if (input$genefinder %in% anno_id) {
    selectedGene <- rownames(values$mydst)[match(input$genefinder,rownames(values$mydst))]
    selectedGeneSymbol <- values$myannotation$gene_name[match(selectedGene,rownames(values$myannotation))]
    }
    if (input$genefinder %in% anno_gene) {
    selectedGeneSymbol <- values$myannotation$gene_name[which(values$myannotation$gene_name==input$genefinder)]
    if (length(selectedGeneSymbol) > 1) return(ggplot() + annotate("text",label=paste0("Type in a gene name/id of the following:\n",paste(selectedGene,collapse=", ")),0,0) + theme_bw())
    selectedGene <- rownames(values$myannotation)[which(values$myannotation$gene_name==input$genefinder)]
    }
    genedata <- plotCounts(values$mydds,gene=selectedGene,intgroup = input$color_by,returnData = TRUE)
    onlyfactors <- genedata[,match(input$color_by,colnames(genedata))]
    genedata$plotby <- interaction(onlyfactors)
    
    if(input$plot_style=="boxplot"){
    res <- ggplot(genedata,aes_string(x="plotby",y="count",fill="plotby")) +
      geom_boxplot(outlier.shape = NA,alpha=0.7) + theme_bw()
    if(input$ylimZero){
      res <- res + scale_y_log10(name="Normalized counts - log10 scale",limits=c(0.4,NA))
    } else {
      res <- res + scale_y_log10(name="Normalized counts - log10 scale")
    }
    
    res <- res +
      labs(title=paste0("Normalized counts for ",selectedGeneSymbol," - ",selectedGene)) +
      scale_x_discrete(name="") +
      geom_jitter(aes_string(x="plotby",y="count"),position = position_jitter(width = 0.1)) +
      scale_fill_discrete(name="Experimental\nconditions")
    # exportPlots$genesBoxplot <- res
    res
    
    } else if(input$plot_style=="violin plot"){
    res <- ggplot(genedata,aes_string(x="plotby",y="count",fill="plotby")) +
      geom_violin(aes_string(col="plotby"),alpha = 0.6) + theme_bw()
    if(input$ylimZero){
      res <- res + scale_y_log10(name="Normalized counts - log10 scale",limits=c(0.4,NA))
    } else {
      res <- res + scale_y_log10(name="Normalized counts - log10 scale")
    }
    
    res <- res +
      labs(title=paste0("Normalized counts for ",selectedGeneSymbol," - ",selectedGene)) +
      scale_x_discrete(name="") +
      geom_jitter(aes_string(x="plotby",y="count"),alpha = 0.8,position = position_jitter(width = 0.1)) +
      scale_fill_discrete(name="Experimental\nconditions") + scale_color_discrete(guide="none")
    # exportPlots$genefinder <- res
    res
  }
}

Repeat the same chunk of code and change the identifier of the gene to obtain the similar plot for the other candidates.

6 Functional interpretation of the principal components

These tables report the functional categories enriched in the genes with the top and bottom loadings in the selected principal components.

if(!is.null(values$mypca2go))
{
  goe <- values$mypca2go[[paste0("PC",input$pc_x)]][["posLoad"]]
  kable(goe, caption=paste0("Functional categories enriched in ","PC",input$pc_x, "- positive loadings"))
}

if(!is.null(values$mypca2go))
{
  goe <- values$mypca2go[[paste0("PC",input$pc_x)]][["negLoad"]]
  kable(goe, caption=paste0("Functional categories enriched in ","PC",input$pc_x, "- negative loadings"))
}

if(!is.null(values$mypca2go))
{
  goe <- values$mypca2go[[paste0("PC",input$pc_y)]][["posLoad"]]
  kable(goe, caption=paste0("Functional categories enriched in ","PC",input$pc_y, "- positive loadings"))
}

if(!is.null(values$mypca2go))
{
  goe <- values$mypca2go[[paste0("PC",input$pc_y)]][["negLoad"]]
  kable(goe, caption=paste0("Functional categories enriched in ","PC",input$pc_y, "- negative loadings"))
}

7 Multifactor exploration of the dataset

if(input$composemat > 0){
  pcmat <- obj3()[[1]]
  tcol <- obj3()[[2]]
  tcol2 <- obj3()[[3]]
  pres <- prcomp(t(pcmat),scale=FALSE)

  plot.index <- c(as.integer(input$pc_x_multifac),as.integer(input$pc_y_multifac))
  offset <- ncol(pcmat)/2
  gene.no <- offset
  pcx <- pres$x
  # set.seed(11)
    # for (i in 1:ncol(pcx)) {
    #   pcx[,i] <- pcx[,i] + rnorm(nrow(pcx),sd=diff(range(pcx[,i]))/100)
    # }
    plot(pcx[(offset+1):ncol(pcmat),plot.index[1]][1:gene.no],pcx[(offset+1):ncol(pcmat),plot.index[2]][1:gene.no],xlim=range(pcx[,plot.index[1]]),ylim=range(pcx[,plot.index[2]]),pch=20,col=tcol,cex=0.3)#,type="n")
    #plot(0,type="n",xlim=range(pres$x[,plot.index]),ylim=range(pres$x[,plot.index]))
    lcol <- ifelse(tcol != tcol2,"black","grey")
    for (i in 1:gene.no) {
      lines(pcx[c(i,offset+i),plot.index[1]],pcx[c(i,offset+i),plot.index[2]],col=lcol[i])
    }
    points(pcx[1:offset,plot.index[1]][1:gene.no],pcx[1:offset,plot.index[2]][1:gene.no],pch=20,col=tcol,cex=0.3)
    points(pcx[(offset+1):ncol(pcmat),plot.index[1]][1:gene.no],pcx[(offset+1):ncol(pcmat),plot.index[2]][1:gene.no],pch=20,col=tcol2,cex=0.3)}

8 About pcaExplorer

pcaExplorer is a Bioconductor package containing a Shiny application for analyzing expression data in different conditions and experimental factors.

pcaExplorer guides the user in exploring the Principal Components of the data, providing tools and functionality to detect outlier samples, genes that show particular patterns, and additionally provides a functional interpretation of the principal components for further quality assessment and hypothesis generation on the input data.

Thanks to its interactive/reactive design, it is designed to become a practical companion to any RNA-seq dataset analysis, making exploratory data analysis accessible also to the bench biologist, while providing additional insight also for the experienced data analyst.

pcaExplorer was developed in the Bioinformatics Division led by Harald Binder at the IMBEI (Institut für Medizinische Biometrie, Epidemiologie und Informatik) in the University Medical Center of the Johannes Gutenberg University Mainz.

8.1 Developers

pcaExplorer is currently maintained by Federico Marini at the IMBEI (www.imbei.uni-mainz.de). You can contact him by clicking on the button below.

Federico Marini

8.2 Code

pcaExplorer is a part of the Bioconductor project (www.bioconductor.org). All code for pcaExplorer, especially for the development version, is available on GitHub.

9 Citation info

If you use pcaExplorer for your analysis, please cite it as here below:

citation("pcaExplorer")

## Please cite the articles below for the 'pcaExplorer' software itself, or its
## usage in combined workflows with the 'ideal' or 'GeneTonic' software
## packages:
## 
##   Federico Marini, Harald Binder (2019). pcaExplorer: an R/Bioconductor
##   package for interacting with RNA-seq principal components. BMC
##   Bioinformatics, 20 (1), 331, <doi:10.1186/s12859-019-2879-1>,
##   <doi:10.18129/B9.bioc.pcaExplorer>.
## 
##   Annekathrin Ludt, Arsenij Ustjanzew, Harald Binder, Konstantin Strauch,
##   Federico Marini (2022). Interactive and Reproducible Workflows for
##   Exploring and Modeling RNA-seq Data with pcaExplorer, ideal, and GeneTonic.
##   Current Protocols, 2 (4), e411, <doi:10.1002/cpz1.411>.
## 
## To see these entries in BibTeX format, use 'print(<citation>, bibtex=TRUE)',
## 'toBibtex(.)', or set 'options(citation.bibtex.max=999)'.

10 Session Information

sessionInfo()

## R version 4.3.2 (2023-10-31)
## Platform: aarch64-apple-darwin20 (64-bit)
## Running under: macOS Ventura 13.3.1
## 
## Matrix products: default
## BLAS:   /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.11.0
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: America/Los_Angeles
## tzcode source: internal
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] BiocStyle_2.30.0            BiocManager_1.30.22        
##  [3] data.table_1.14.10          RColorBrewer_1.1-3         
##  [5] pheatmap_1.0.12             lubridate_1.9.3            
##  [7] forcats_1.0.0               stringr_1.5.1              
##  [9] dplyr_1.1.4                 purrr_1.0.2                
## [11] readr_2.1.4                 tidyr_1.3.0                
## [13] tibble_3.2.1                ggplot2_3.4.4              
## [15] tidyverse_2.0.0             DESeq2_1.42.0              
## [17] SummarizedExperiment_1.32.0 MatrixGenerics_1.14.0      
## [19] matrixStats_1.2.0           GenomicRanges_1.54.1       
## [21] GenomeInfoDb_1.38.2         IRanges_2.36.0             
## [23] S4Vectors_0.40.2            shiny_1.8.0                
## [25] pcaExplorer_2.28.0          Biobase_2.62.0             
## [27] BiocGenerics_0.48.1        
## 
## loaded via a namespace (and not attached):
##   [1] splines_4.3.2           later_1.3.2             bitops_1.0-7           
##   [4] filelock_1.0.3          graph_1.80.0            XML_3.99-0.16          
##   [7] lifecycle_1.0.4         topGO_2.54.0            doParallel_1.0.17      
##  [10] lattice_0.22-5          crosstalk_1.2.1         dendextend_1.17.1      
##  [13] magrittr_2.0.3          sass_0.4.8              limma_3.58.1           
##  [16] plotly_4.10.3           rmarkdown_2.25          jquerylib_0.1.4        
##  [19] yaml_2.3.8              shinyBS_0.61.1          httpuv_1.6.13          
##  [22] NMF_0.26                DBI_1.1.3               abind_1.4-5            
##  [25] zlibbioc_1.48.0         RCurl_1.98-1.13         rappdirs_0.3.3         
##  [28] seriation_1.5.4         GenomeInfoDbData_1.2.11 ggrepel_0.9.4          
##  [31] AnnotationForge_1.44.0  genefilter_1.84.0       annotate_1.80.0        
##  [34] commonmark_1.9.0        codetools_0.2-19        DelayedArray_0.28.0    
##  [37] DT_0.31                 xml2_1.3.6              tidyselect_1.2.0       
##  [40] GOstats_2.68.0          farver_2.1.1            viridis_0.6.4          
##  [43] TSP_1.2-4               BiocFileCache_2.10.1    base64enc_0.1-3        
##  [46] webshot_0.5.5           jsonlite_1.8.8          ellipsis_0.3.2         
##  [49] survival_3.5-7          iterators_1.0.14        systemfonts_1.0.5      
##  [52] foreach_1.5.2           tools_4.3.2             progress_1.2.3         
##  [55] ragg_1.2.7              Rcpp_1.0.11             glue_1.6.2             
##  [58] gridExtra_2.3           SparseArray_1.2.2       xfun_0.41              
##  [61] ca_0.71.1               withr_2.5.2             shinydashboard_0.7.2   
##  [64] Category_2.68.0         fastmap_1.1.1           fansi_1.0.6            
##  [67] SparseM_1.81            digest_0.6.33           timechange_0.2.0       
##  [70] R6_2.5.1                mime_0.12               textshaping_0.3.7      
##  [73] colorspace_2.1-0        GO.db_3.18.0            airway_1.22.0          
##  [76] markdown_1.12           biomaRt_2.58.0          RSQLite_2.3.4          
##  [79] threejs_0.3.3           utf8_1.2.4              generics_0.1.3         
##  [82] prettyunits_1.2.0       httr_1.4.7              htmlwidgets_1.6.4      
##  [85] S4Arrays_1.2.0          pkgconfig_2.0.3         gtable_0.3.4           
##  [88] blob_1.2.4              registry_0.5-1          XVector_0.42.0         
##  [91] htmltools_0.5.7         RBGL_1.78.0             GSEABase_1.64.0        
##  [94] scales_1.3.0            png_0.1-8               knitr_1.45             
##  [97] rstudioapi_0.15.0       tzdb_0.4.0              reshape2_1.4.4         
## [100] curl_5.2.0              shinyAce_0.4.2          cachem_1.0.8           
## [103] parallel_4.3.2          AnnotationDbi_1.64.1    pillar_1.9.0           
## [106] grid_4.3.2              vctrs_0.6.5             promises_1.2.1         
## [109] dbplyr_2.4.0            xtable_1.8-4            cluster_2.1.6          
## [112] Rgraphviz_2.46.0        evaluate_0.23           cli_3.6.2              
## [115] locfit_1.5-9.8          compiler_4.3.2          rlang_1.1.2            
## [118] crayon_1.5.2            rngtools_1.5.2          heatmaply_1.5.0        
## [121] labeling_0.4.3          plyr_1.8.9              stringi_1.8.3          
## [124] viridisLite_0.4.2       gridBase_0.4-7          BiocParallel_1.36.0    
## [127] assertthat_0.2.1        munsell_0.5.0           Biostrings_2.70.1      
## [130] lazyeval_0.2.2          Matrix_1.6-4            hms_1.1.3              
## [133] bit64_4.0.5             KEGGREST_1.42.0         statmod_1.5.0          
## [136] highr_0.10              fontawesome_0.5.2       igraph_1.6.0           
## [139] memoise_2.0.1           bslib_0.6.1             bit_4.0.5