1 About this report
2 Overview on the data
3 PCA on the samples
4 PCA on the genes
5 Shortlisted genes
6 Functional interpretation of the principal components
7 Multifactor exploration of the dataset
8 About pcaExplorer
- 8.1 Developers
- 8.2 Code
9 Citation info
10 Session Information

1 About this report

This content has been loaded from the template report .Rmd file. Please edit it at your best convenience!

If you are viewing this report in the Preview, you might require the installation of the PhantomJS to render correctly some HTML widgets. This can be done by using the webshot package and calling webshot::install_phantomjs(). Alternatively, the more recent webshot2 package uses the headless Chrome browser (via the chromote package, requiring Google Chrome or other Chromium-based browser).

2 Overview on the data

The data provided was used to construct the following objects

values$mydds

## class: DESeqDataSet 
## dim: 26581 32 
## metadata(1): version
## assays(1): counts
## rownames(26581): g8757.t1 g17014.t1 ... g15464.t1 g13060.t2
## rowData names(0):
## colnames(32): PSC.0085 PSC.0087 ... PSC.0231 PSC.0235
## colData names(8): coelom_ID star_ID ... sample_date sizeFactor

values$mydst

## class: DESeqTransform 
## dim: 26581 32 
## metadata(1): version
## assays(1): ''
## rownames(26581): g8757.t1 g17014.t1 ... g15464.t1 g13060.t2
## rowData names(4): baseMean baseVar allZero dispFit
## colnames(32): PSC.0085 PSC.0087 ... PSC.0231 PSC.0235
## colData names(8): coelom_ID star_ID ... sample_date sizeFactor

values$transformation_type

## [1] "vst"

head(values$myannotation)

## NULL

The following design were used:

DT::datatable(as.data.frame(colData(values$mydds)))

An overview of the table for the features is shown here, by displaying the raw_counts

if(input$countstable_unit=="raw_counts")
  currentMat <- counts(values$mydds,normalized=FALSE)
if(input$countstable_unit=="normalized_counts")
  currentMat <- counts(values$mydds,normalized=TRUE)
if(input$countstable_unit=="rlog_counts")
  currentMat <- assay(values$mydst)
if(input$countstable_unit=="log10_counts")
  currentMat <- log10(1 + counts(values$mydds,normalized=TRUE))

DT::datatable(currentMat)

This is how the samples cluster if we use euclidean distance on the rlog transformed values

if (!is.null(input$color_by)){
  expgroups <- as.data.frame(colData(values$mydst)[,input$color_by])
  # expgroups <- interaction(expgroups)
  rownames(expgroups) <- colnames(values$mydst)
  colnames(expgroups) <- input$color_by

  pheatmap(as.matrix(dist(t(assay(values$mydst)))),annotation_col = expgroups)
} else {
  pheatmap(as.matrix(dist(t(assay(values$mydst)))))
}

This is an overview of the number of available reads in each sample (normally these are only uniquely aligned reads)

rr <- colSums(counts(values$mydds))/1e6
if(is.null(names(rr)))
  names(rr) <- paste0("sample_",1:length(rr))
rrdf <- data.frame(Reads=rr,Sample=names(rr),stringsAsFactors = FALSE)
if (!is.null(input$color_by)) {
  selGroups <- as.data.frame(colData(values$mydds)[input$color_by])
  rrdf$Group <- interaction(selGroups)
  p <- ggplot(rrdf,aes_string("Sample",weight="Reads")) + geom_bar(aes_string(fill="Group")) + theme_bw()
  p
} else {
  p <- ggplot(rrdf,aes_string("Sample",weight="Reads")) + geom_bar() + theme_bw()
  p
}

print(colSums(counts(values$mydds)))

## PSC.0085 PSC.0087 PSC.0090 PSC.0093 PSC.0095 PSC.0096 PSC.0099 PSC.0105 PSC.0118 
##  4473674  5332139  4612242  4990635  5994449  6494502  6037710  6074006  6123980 
## PSC.0119 PSC.0132 PSC.0141 PSC.0149 PSC.0150 PSC.0156 PSC.0174 PSC.0177 PSC.0186 
##  6097582  5760283  6327206  5466522  1531748  4403851  5344256  4762647  4746359 
## PSC.0187 PSC.0188 PSC.0190 PSC.0198 PSC.0202 PSC.0203 PSC.0206 PSC.0209 PSC.0217 
##  5206071  6084335  5905396  4267136  5175535  4505722  4998282  5298937  4722399 
## PSC.0219 PSC.0228 PSC.0230 PSC.0231 PSC.0235 
##  4868408  6824405  6551109  8345219  7004484

summary(colSums(counts(values$mydds))/1e6)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.532   4.759   5.338   5.448   6.088   8.345

This is a quick info on the number of detected genes

t1 <- rowSums(counts(values$mydds))
t2 <- rowMeans(counts(values$mydds,normalized=TRUE))

thresh_rowsums <- input$threshold_rowsums
thresh_rowmeans <- input$threshold_rowmeans
abs_t1 <- sum(t1 > thresh_rowsums)
rel_t1 <- 100 * mean(t1 > thresh_rowsums)
abs_t2 <- sum(t2 > thresh_rowmeans)
rel_t2 <- 100 * mean(t2 > thresh_rowmeans)

cat("Number of detected genes:\n")

## Number of detected genes:

# TODO: parametrize the thresholds
cat(abs_t1,"genes have at least a sample with more than",thresh_rowsums,"counts\n")

## 26056 genes have at least a sample with more than 0 counts

cat(paste0(round(rel_t1,3),"%"), "of the",nrow(values$mydds),"genes have at least a sample with more than",thresh_rowsums,"counts\n")

## 98.025% of the 26581 genes have at least a sample with more than 0 counts

cat(abs_t2,"genes have more than",thresh_rowmeans,"counts (normalized) on average\n")

## 26056 genes have more than 0 counts (normalized) on average

cat(paste0(round(rel_t2,3),"%"), "of the",nrow(values$mydds),"genes have more than",thresh_rowsums,"counts (normalized) on average\n")

## 98.025% of the 26581 genes have more than 0 counts (normalized) on average

cat("Counts are ranging from", min(counts(values$mydds)),"to",max(counts(values$mydds)))

## Counts are ranging from 0 to 899001

3 PCA on the samples

This plot shows how the samples are related to each other by plotting PC 1 vs PC 2, using the top 300 most variant genes

res <- pcaplot(values$mydst,intgroup = input$color_by,ntop = input$pca_nrgenes,
                     pcX = as.integer(input$pc_x),pcY = as.integer(input$pc_y),
                     text_labels = input$sample_labels,
                     point_size = input$pca_point_size, title="Samples PCA - zoom in",
                     ellipse = input$pca_ellipse, ellipse.prob = input$pca_cislider
      )
res <- res + theme_bw()
res

## Warning: ggrepel: 9 unlabeled data points (too many overlaps). Consider increasing
## max.overlaps

The scree plot helps determining the number of underlying principal components

rv <- rowVars(assay(values$mydst))
select <- order(rv, decreasing = TRUE)[seq_len(min(input$pca_nrgenes,length(rv)))]
pca <- prcomp(t(assay(values$mydst)[select, ]))

res <- pcascree(pca,type = input$scree_type, pc_nr = input$scree_pcnr, title="Scree plot for the samples PCA")
res <- res + theme_bw()
res

The genes with the highest loadings in the selected principal components are the following

rv <- rowVars(assay(values$mydst))
select <- order(rv, decreasing = TRUE)[seq_len(min(input$pca_nrgenes,length(rv)))]
pca <- prcomp(t(assay(values$mydst)[select, ]))

par(mfrow=c(2,1))
hi_loadings(pca,whichpc = as.integer(input$pc_x),topN = input$ntophiload,annotation = values$myannotation)
hi_loadings(pca,whichpc = as.integer(input$pc_y),topN = input$ntophiload,annotation = values$myannotation)

4 PCA on the genes

This plot illustrates how the top 300 variant genes are distributed in PC 1 vs PC 2

if(!is.null(input$color_by)) {
        expgroups <- as.data.frame(colData(values$mydst)[,input$color_by])
        expgroups <- interaction(expgroups)
        expgroups <- factor(expgroups,levels=unique(expgroups))

      } else {
        expgroups <- colnames(values$mydst)
      }
      colGroups <- colSel()[factor(expgroups)]

res <- genespca(values$mydst,
                ntop = input$pca_nrgenes,
                choices = c(as.integer(input$pc_x),as.integer(input$pc_y)),
                biplot = TRUE,
                arrowColors = factor(colGroups,levels=unique(colGroups)),
                groupNames = expgroups,
                alpha=input$pca_point_alpha,coordEqual=FALSE,useRownamesAsLabels=FALSE,labels.size=input$pca_label_size,
                point_size=input$pca_point_size,varname.size=input$pca_varname_size, scaleArrow = input$pca_scale_arrow,
                annotation=values$myannotation)
res

For the selected genes, this is the overall profile across all samples

if(!is.null(input$pcagenes_brush) & length(input$color_by)>0)
      geneprofiler(values$mydst,
                   genelist = curData_brush()$ids,
                   intgroup = input$color_by,
                   plotZ = input$zprofile)

And here is an interactive heatmap for that subset

if(!is.null(input$pcagenes_brush))
{
  brushedObject <- curData_brush()
  if(nrow(brushedObject) > 1){
    selectedGenes <- brushedObject$ids
    toplot <- assay(values$mydst)[selectedGenes,]
    rownames(toplot) <- values$myannotation$gene_name[match(rownames(toplot),rownames(values$myannotation))]
  
    mycolss <- c("#313695","#4575b4","#74add1","#abd9e9","#e0f3f8","#fee090","#fdae61","#f46d43","#d73027","#a50026") # to be consistent with red/blue usual coding
  
    heatmaply(toplot,Colv = as.logical(input$heatmap_colv),colors = mycolss)
  }
}

5 Shortlisted genes

This gene was selected in the interactive session.

anno_id <- rownames(values$mydst)
      anno_gene <- values$myannotation$gene_name

      # if(is.null(input$color_by) & input$genefinder!="")
      #   return(ggplot() + annotate("text",label="Select a factor to plot your gene",0,0) + theme_bw())
      # if(is.null(input$color_by) & input$genefinder=="")
      #   return(ggplot() + annotate("text",label="Select a gene and a factor to plot gene",0,0) + theme_bw())
      # if(input$genefinder=="")
      #   return(ggplot() + annotate("text",label="Type in a gene name/id",0,0) + theme_bw())
      # if(!input$genefinder %in% anno_gene & !input$genefinder %in% anno_id)
      #   return(ggplot() + annotate("text",label="Gene not found...",0,0) + theme_bw())
if(input$genefinder!="") {

  if (input$genefinder %in% anno_id) {
    selectedGene <- rownames(values$mydst)[match(input$genefinder,rownames(values$mydst))]
    selectedGeneSymbol <- values$myannotation$gene_name[match(selectedGene,rownames(values$myannotation))]
    }
    if (input$genefinder %in% anno_gene) {
    selectedGeneSymbol <- values$myannotation$gene_name[which(values$myannotation$gene_name==input$genefinder)]
    if (length(selectedGeneSymbol) > 1) return(ggplot() + annotate("text",label=paste0("Type in a gene name/id of the following:\n",paste(selectedGene,collapse=", ")),0,0) + theme_bw())
    selectedGene <- rownames(values$myannotation)[which(values$myannotation$gene_name==input$genefinder)]
    }
    genedata <- plotCounts(values$mydds,gene=selectedGene,intgroup = input$color_by,returnData = TRUE)
    onlyfactors <- genedata[,match(input$color_by,colnames(genedata))]
    genedata$plotby <- interaction(onlyfactors)
    
    if(input$plot_style=="boxplot"){
    res <- ggplot(genedata,aes_string(x="plotby",y="count",fill="plotby")) +
      geom_boxplot(outlier.shape = NA,alpha=0.7) + theme_bw()
    if(input$ylimZero){
      res <- res + scale_y_log10(name="Normalized counts - log10 scale",limits=c(0.4,NA))
    } else {
      res <- res + scale_y_log10(name="Normalized counts - log10 scale")
    }
    
    res <- res +
      labs(title=paste0("Normalized counts for ",selectedGeneSymbol," - ",selectedGene)) +
      scale_x_discrete(name="") +
      geom_jitter(aes_string(x="plotby",y="count"),position = position_jitter(width = 0.1)) +
      scale_fill_discrete(name="Experimental\nconditions")
    # exportPlots$genesBoxplot <- res
    res
    
    } else if(input$plot_style=="violin plot"){
    res <- ggplot(genedata,aes_string(x="plotby",y="count",fill="plotby")) +
      geom_violin(aes_string(col="plotby"),alpha = 0.6) + theme_bw()
    if(input$ylimZero){
      res <- res + scale_y_log10(name="Normalized counts - log10 scale",limits=c(0.4,NA))
    } else {
      res <- res + scale_y_log10(name="Normalized counts - log10 scale")
    }
    
    res <- res +
      labs(title=paste0("Normalized counts for ",selectedGeneSymbol," - ",selectedGene)) +
      scale_x_discrete(name="") +
      geom_jitter(aes_string(x="plotby",y="count"),alpha = 0.8,position = position_jitter(width = 0.1)) +
      scale_fill_discrete(name="Experimental\nconditions") + scale_color_discrete(guide="none")
    # exportPlots$genefinder <- res
    res
  }
}

## Error in .subset2(x, i, exact = exact): attempt to select less than one element in integerOneIndex

Repeat the same chunk of code and change the identifier of the gene to obtain the similar plot for the other candidates.

6 Functional interpretation of the principal components

These tables report the functional categories enriched in the genes with the top and bottom loadings in the selected principal components.

if(!is.null(values$mypca2go))
{
  goe <- values$mypca2go[[paste0("PC",input$pc_x)]][["posLoad"]]
  kable(goe, caption=paste0("Functional categories enriched in ","PC",input$pc_x, "- positive loadings"))
}

if(!is.null(values$mypca2go))
{
  goe <- values$mypca2go[[paste0("PC",input$pc_x)]][["negLoad"]]
  kable(goe, caption=paste0("Functional categories enriched in ","PC",input$pc_x, "- negative loadings"))
}

if(!is.null(values$mypca2go))
{
  goe <- values$mypca2go[[paste0("PC",input$pc_y)]][["posLoad"]]
  kable(goe, caption=paste0("Functional categories enriched in ","PC",input$pc_y, "- positive loadings"))
}

if(!is.null(values$mypca2go))
{
  goe <- values$mypca2go[[paste0("PC",input$pc_y)]][["negLoad"]]
  kable(goe, caption=paste0("Functional categories enriched in ","PC",input$pc_y, "- negative loadings"))
}

7 Multifactor exploration of the dataset

if(input$composemat > 0){
  pcmat <- obj3()[[1]]
  tcol <- obj3()[[2]]
  tcol2 <- obj3()[[3]]
  pres <- prcomp(t(pcmat),scale=FALSE)

  plot.index <- c(as.integer(input$pc_x_multifac),as.integer(input$pc_y_multifac))
  offset <- ncol(pcmat)/2
  gene.no <- offset
  pcx <- pres$x
  # set.seed(11)
    # for (i in 1:ncol(pcx)) {
    #   pcx[,i] <- pcx[,i] + rnorm(nrow(pcx),sd=diff(range(pcx[,i]))/100)
    # }
    plot(pcx[(offset+1):ncol(pcmat),plot.index[1]][1:gene.no],pcx[(offset+1):ncol(pcmat),plot.index[2]][1:gene.no],xlim=range(pcx[,plot.index[1]]),ylim=range(pcx[,plot.index[2]]),pch=20,col=tcol,cex=0.3)#,type="n")
    #plot(0,type="n",xlim=range(pres$x[,plot.index]),ylim=range(pres$x[,plot.index]))
    lcol <- ifelse(tcol != tcol2,"black","grey")
    for (i in 1:gene.no) {
      lines(pcx[c(i,offset+i),plot.index[1]],pcx[c(i,offset+i),plot.index[2]],col=lcol[i])
    }
    points(pcx[1:offset,plot.index[1]][1:gene.no],pcx[1:offset,plot.index[2]][1:gene.no],pch=20,col=tcol,cex=0.3)
    points(pcx[(offset+1):ncol(pcmat),plot.index[1]][1:gene.no],pcx[(offset+1):ncol(pcmat),plot.index[2]][1:gene.no],pch=20,col=tcol2,cex=0.3)}

8 About pcaExplorer

pcaExplorer is a Bioconductor package containing a Shiny application for analyzing expression data in different conditions and experimental factors.

pcaExplorer guides the user in exploring the Principal Components of the data, providing tools and functionality to detect outlier samples, genes that show particular patterns, and additionally provides a functional interpretation of the principal components for further quality assessment and hypothesis generation on the input data.

Thanks to its interactive/reactive design, it is designed to become a practical companion to any RNA-seq dataset analysis, making exploratory data analysis accessible also to the bench biologist, while providing additional insight also for the experienced data analyst.

pcaExplorer was developed in the Bioinformatics Division led by Harald Binder at the IMBEI (Institut für Medizinische Biometrie, Epidemiologie und Informatik) in the University Medical Center of the Johannes Gutenberg University Mainz.

8.1 Developers

pcaExplorer is currently maintained by Federico Marini at the IMBEI (www.imbei.uni-mainz.de). You can contact him by clicking on the button below.

Federico Marini

8.2 Code

pcaExplorer is a part of the Bioconductor project (www.bioconductor.org). All code for pcaExplorer, especially for the development version, is available on GitHub.

9 Citation info

If you use pcaExplorer for your analysis, please cite it as here below:

citation("pcaExplorer")

## Please cite the articles below for the 'pcaExplorer' software itself, or its
## usage in combined workflows with the 'ideal' or 'GeneTonic' software
## packages:
## 
##   Federico Marini, Harald Binder (2019). pcaExplorer: an R/Bioconductor
##   package for interacting with RNA-seq principal components. BMC
##   Bioinformatics, 20 (1), 331, <doi:10.1186/s12859-019-2879-1>,
##   <doi:10.18129/B9.bioc.pcaExplorer>.
## 
##   Annekathrin Ludt, Arsenij Ustjanzew, Harald Binder, Konstantin Strauch,
##   Federico Marini (2022). Interactive and Reproducible Workflows for
##   Exploring and Modeling RNA-seq Data with pcaExplorer, ideal, and GeneTonic.
##   Current Protocols, 2 (4), e411, <doi:10.1002/cpz1.411>.
## 
## To see these entries in BibTeX format, use 'print(<citation>, bibtex=TRUE)',
## 'toBibtex(.)', or set 'options(citation.bibtex.max=999)'.

10 Session Information

sessionInfo()

## R version 4.3.2 (2023-10-31)
## Platform: aarch64-apple-darwin20 (64-bit)
## Running under: macOS Ventura 13.3.1
## 
## Matrix products: default
## BLAS:   /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.11.0
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: America/Los_Angeles
## tzcode source: internal
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] BiocStyle_2.30.0            BiocManager_1.30.22        
##  [3] data.table_1.14.10          RColorBrewer_1.1-3         
##  [5] pheatmap_1.0.12             lubridate_1.9.3            
##  [7] forcats_1.0.0               stringr_1.5.1              
##  [9] dplyr_1.1.4                 purrr_1.0.2                
## [11] readr_2.1.4                 tidyr_1.3.0                
## [13] tibble_3.2.1                ggplot2_3.4.4              
## [15] tidyverse_2.0.0             DESeq2_1.42.0              
## [17] SummarizedExperiment_1.32.0 MatrixGenerics_1.14.0      
## [19] matrixStats_1.2.0           GenomicRanges_1.54.1       
## [21] GenomeInfoDb_1.38.2         IRanges_2.36.0             
## [23] S4Vectors_0.40.2            shiny_1.8.0                
## [25] pcaExplorer_2.28.0          Biobase_2.62.0             
## [27] BiocGenerics_0.48.1        
## 
## loaded via a namespace (and not attached):
##   [1] splines_4.3.2           later_1.3.2             bitops_1.0-7           
##   [4] filelock_1.0.3          graph_1.80.0            XML_3.99-0.16          
##   [7] lifecycle_1.0.4         topGO_2.54.0            doParallel_1.0.17      
##  [10] lattice_0.22-5          crosstalk_1.2.1         dendextend_1.17.1      
##  [13] magrittr_2.0.3          sass_0.4.8              limma_3.58.1           
##  [16] plotly_4.10.3           rmarkdown_2.25          jquerylib_0.1.4        
##  [19] yaml_2.3.8              shinyBS_0.61.1          httpuv_1.6.13          
##  [22] NMF_0.26                DBI_1.1.3               abind_1.4-5            
##  [25] zlibbioc_1.48.0         RCurl_1.98-1.13         rappdirs_0.3.3         
##  [28] seriation_1.5.4         GenomeInfoDbData_1.2.11 ggrepel_0.9.4          
##  [31] AnnotationForge_1.44.0  genefilter_1.84.0       annotate_1.80.0        
##  [34] commonmark_1.9.0        codetools_0.2-19        DelayedArray_0.28.0    
##  [37] DT_0.31                 xml2_1.3.6              tidyselect_1.2.0       
##  [40] GOstats_2.68.0          farver_2.1.1            viridis_0.6.4          
##  [43] TSP_1.2-4               BiocFileCache_2.10.1    base64enc_0.1-3        
##  [46] webshot_0.5.5           jsonlite_1.8.8          ellipsis_0.3.2         
##  [49] survival_3.5-7          iterators_1.0.14        systemfonts_1.0.5      
##  [52] foreach_1.5.2           tools_4.3.2             progress_1.2.3         
##  [55] ragg_1.2.7              Rcpp_1.0.11             glue_1.6.2             
##  [58] gridExtra_2.3           SparseArray_1.2.2       xfun_0.41              
##  [61] ca_0.71.1               withr_2.5.2             shinydashboard_0.7.2   
##  [64] Category_2.68.0         fastmap_1.1.1           fansi_1.0.6            
##  [67] SparseM_1.81            digest_0.6.33           timechange_0.2.0       
##  [70] R6_2.5.1                mime_0.12               textshaping_0.3.7      
##  [73] colorspace_2.1-0        GO.db_3.18.0            airway_1.22.0          
##  [76] markdown_1.12           biomaRt_2.58.0          RSQLite_2.3.4          
##  [79] threejs_0.3.3           utf8_1.2.4              generics_0.1.3         
##  [82] prettyunits_1.2.0       httr_1.4.7              htmlwidgets_1.6.4      
##  [85] S4Arrays_1.2.0          pkgconfig_2.0.3         gtable_0.3.4           
##  [88] blob_1.2.4              registry_0.5-1          XVector_0.42.0         
##  [91] htmltools_0.5.7         RBGL_1.78.0             GSEABase_1.64.0        
##  [94] scales_1.3.0            png_0.1-8               knitr_1.45             
##  [97] rstudioapi_0.15.0       tzdb_0.4.0              reshape2_1.4.4         
## [100] curl_5.2.0              shinyAce_0.4.2          cachem_1.0.8           
## [103] parallel_4.3.2          AnnotationDbi_1.64.1    pillar_1.9.0           
## [106] grid_4.3.2              vctrs_0.6.5             promises_1.2.1         
## [109] dbplyr_2.4.0            xtable_1.8-4            cluster_2.1.6          
## [112] Rgraphviz_2.46.0        evaluate_0.23           cli_3.6.2              
## [115] locfit_1.5-9.8          compiler_4.3.2          rlang_1.1.2            
## [118] crayon_1.5.2            rngtools_1.5.2          heatmaply_1.5.0        
## [121] labeling_0.4.3          plyr_1.8.9              stringi_1.8.3          
## [124] viridisLite_0.4.2       gridBase_0.4-7          BiocParallel_1.36.0    
## [127] assertthat_0.2.1        munsell_0.5.0           Biostrings_2.70.1      
## [130] lazyeval_0.2.2          Matrix_1.6-4            hms_1.1.3              
## [133] bit64_4.0.5             KEGGREST_1.42.0         statmod_1.5.0          
## [136] highr_0.10              fontawesome_0.5.2       igraph_1.6.0           
## [139] memoise_2.0.1           bslib_0.6.1             bit_4.0.5