load packages

### only run if packages not already installed

# if (!requireNamespace("BiocManager", quietly = TRUE))
#  install.packages("BiocManager")
# BiocManager::install("biocLite")
library(DESeq2)
## Loading required package: S4Vectors
## Loading required package: stats4
## Loading required package: BiocGenerics
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, aperm, append, as.data.frame, basename, cbind,
##     colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
##     get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
##     match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
##     Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
##     table, tapply, union, unique, unsplit, which.max, which.min
## 
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname
## Loading required package: IRanges
## Loading required package: GenomicRanges
## Loading required package: GenomeInfoDb
## Loading required package: SummarizedExperiment
## Loading required package: MatrixGenerics
## Loading required package: matrixStats
## 
## Attaching package: 'MatrixGenerics'
## The following objects are masked from 'package:matrixStats':
## 
##     colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
##     colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
##     colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
##     colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
##     colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
##     colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
##     colWeightedMeans, colWeightedMedians, colWeightedSds,
##     colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
##     rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
##     rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
##     rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
##     rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
##     rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
##     rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
##     rowWeightedSds, rowWeightedVars
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## 
## Attaching package: 'Biobase'
## The following object is masked from 'package:MatrixGenerics':
## 
##     rowMedians
## The following objects are masked from 'package:matrixStats':
## 
##     anyMissing, rowMedians
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ lubridate::%within%() masks IRanges::%within%()
## ✖ dplyr::collapse()     masks IRanges::collapse()
## ✖ dplyr::combine()      masks Biobase::combine(), BiocGenerics::combine()
## ✖ dplyr::count()        masks matrixStats::count()
## ✖ dplyr::desc()         masks IRanges::desc()
## ✖ tidyr::expand()       masks S4Vectors::expand()
## ✖ dplyr::filter()       masks stats::filter()
## ✖ dplyr::first()        masks S4Vectors::first()
## ✖ dplyr::lag()          masks stats::lag()
## ✖ ggplot2::Position()   masks BiocGenerics::Position(), base::Position()
## ✖ purrr::reduce()       masks GenomicRanges::reduce(), IRanges::reduce()
## ✖ dplyr::rename()       masks S4Vectors::rename()
## ✖ lubridate::second()   masks S4Vectors::second()
## ✖ lubridate::second<-() masks S4Vectors::second<-()
## ✖ dplyr::slice()        masks IRanges::slice()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

DESeq2

load in count data

options(scipen = 999) #disables printing results in scientific notation

data <- read_delim("../output/kallisto.isoform.counts.matrix") 
## New names:
## Rows: 64154 Columns: 31
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "\t" chr
## (1): ...1 dbl (30): M-C-193, M-C-216, M-C-218, M-C-226, M-C-306, M-C-329,
## M-C-334, M-C...
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
## here we want to specify our row names from out first column of data. it can be a little funky, but saving this order should work:

# create object for row names from the first column
# used the $ index because brackets were giving trouble
rownames <- data$...1
head(rownames)
## [1] "XM_060701415.1" "XM_060721439.1" "XM_060696058.1" "XR_009625421.1"
## [5] "XM_060730230.1" "XM_060707611.1"
# remove that first column now that it's values are stored
data <- data[,-1]
head(data)
## # A tibble: 6 × 30
##   `M-C-193` `M-C-216` `M-C-218` `M-C-226` `M-C-306` `M-C-329` `M-C-334`
##       <dbl>     <dbl>     <dbl>     <dbl>     <dbl>     <dbl>     <dbl>
## 1       18        19        58        16        29        28        31 
## 2        0         0         0         1         0         0         0 
## 3        0         0         0         0         0         0         0 
## 4      286.      179.      235.      445.      122.      139.      257.
## 5      811       694       866       869       640       864       482.
## 6      663.      360.      645.      400.      182.      259.      239.
## # ℹ 23 more variables: `M-C-337` <dbl>, `M-C-339` <dbl>, `M-C-358` <dbl>,
## #   `M-C-360` <dbl>, `M-C-363` <dbl>, `M-C-373` <dbl>, `M-C-482` <dbl>,
## #   `M-C-488` <dbl>, `M-T-18` <dbl>, `M-T-20` <dbl>, `M-T-22` <dbl>,
## #   `M-T-235` <dbl>, `M-T-245` <dbl>, `M-T-29` <dbl>, `M-T-31` <dbl>,
## #   `M-T-32` <dbl>, `M-T-399` <dbl>, `M-T-43` <dbl>, `M-T-44` <dbl>,
## #   `M-T-500` <dbl>, `M-T-7` <dbl>, `M-T-83` <dbl>, `M-T-8` <dbl>
#ensures all data as integer
data <- round(data, digits = 0)
head(data)
## # A tibble: 6 × 30
##   `M-C-193` `M-C-216` `M-C-218` `M-C-226` `M-C-306` `M-C-329` `M-C-334`
##       <dbl>     <dbl>     <dbl>     <dbl>     <dbl>     <dbl>     <dbl>
## 1        18        19        58        16        29        28        31
## 2         0         0         0         1         0         0         0
## 3         0         0         0         0         0         0         0
## 4       286       179       235       445       122       139       257
## 5       811       694       866       869       640       864       482
## 6       663       360       645       400       182       259       239
## # ℹ 23 more variables: `M-C-337` <dbl>, `M-C-339` <dbl>, `M-C-358` <dbl>,
## #   `M-C-360` <dbl>, `M-C-363` <dbl>, `M-C-373` <dbl>, `M-C-482` <dbl>,
## #   `M-C-488` <dbl>, `M-T-18` <dbl>, `M-T-20` <dbl>, `M-T-22` <dbl>,
## #   `M-T-235` <dbl>, `M-T-245` <dbl>, `M-T-29` <dbl>, `M-T-31` <dbl>,
## #   `M-T-32` <dbl>, `M-T-399` <dbl>, `M-T-43` <dbl>, `M-T-44` <dbl>,
## #   `M-T-500` <dbl>, `M-T-7` <dbl>, `M-T-83` <dbl>, `M-T-8` <dbl>
# assign the row names to the data using the object created 
rownames(data) <- rownames
## Warning: Setting row names on a tibble is deprecated.
head(data)
## # A tibble: 6 × 30
##   `M-C-193` `M-C-216` `M-C-218` `M-C-226` `M-C-306` `M-C-329` `M-C-334`
##       <dbl>     <dbl>     <dbl>     <dbl>     <dbl>     <dbl>     <dbl>
## 1        18        19        58        16        29        28        31
## 2         0         0         0         1         0         0         0
## 3         0         0         0         0         0         0         0
## 4       286       179       235       445       122       139       257
## 5       811       694       866       869       640       864       482
## 6       663       360       645       400       182       259       239
## # ℹ 23 more variables: `M-C-337` <dbl>, `M-C-339` <dbl>, `M-C-358` <dbl>,
## #   `M-C-360` <dbl>, `M-C-363` <dbl>, `M-C-373` <dbl>, `M-C-482` <dbl>,
## #   `M-C-488` <dbl>, `M-T-18` <dbl>, `M-T-20` <dbl>, `M-T-22` <dbl>,
## #   `M-T-235` <dbl>, `M-T-245` <dbl>, `M-T-29` <dbl>, `M-T-31` <dbl>,
## #   `M-T-32` <dbl>, `M-T-399` <dbl>, `M-T-43` <dbl>, `M-T-44` <dbl>,
## #   `M-T-500` <dbl>, `M-T-7` <dbl>, `M-T-83` <dbl>, `M-T-8` <dbl>

build objects; specify which columns are in groups

deseq2.colData <- data.frame(condition = factor(c(rep("Control", 15), rep("Treatment", 15))), type = factor(rep("paired-read", 30)))

rownames(deseq2.colData) <- colnames(data)

deseq2.dds <- DESeqDataSetFromMatrix(countData = data,
                                     colData = deseq2.colData, 
                                     design = ~ condition)
## converting counts to integer mode

run analysis

deseq2.dds <- DESeq(deseq2.dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 4515 genes
## -- DESeq argument 'minReplicatesForReplace' = 7 
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
deseq2.res <- results(deseq2.dds)
deseq2.res <- deseq2.res[order(rownames(deseq2.res)),]

summary(deseq2.res)
## 
## out of 55171 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up)       : 59, 0.11%
## LFC < 0 (down)     : 76, 0.14%
## outliers [1]       : 0, 0%
## low counts [2]     : 19624, 36%
## (mean count < 1)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results

Count number of hits with adjusted p-value less then 0.05

DEG <- deseq2.res[!is.na(deseq2.res$padj) & deseq2.res$padj <= 0.05, ]

tmp <- deseq2.res

write.table(DEG,"../output/1213-DEG.tab", row.names = T)

Visualization

Volcano

plot(tmp$baseMean, tmp$log2FoldChange, pch=20, cex=0.45, ylim=c(-10, 10), log="x", col="darkgray",
     main="Treatmetn versus Control  (pval <= 0.05)",
     xlab="mean of normalized counts",
     ylab="Log2 Fold Change")
## Warning in xy.coords(x, y, xlabel, ylabel, log): 8983 x values <= 0 omitted
## from logarithmic plot
# Getting the significant points and plotting them again so they're a different color
tmp.sig <- deseq2.res[!is.na(deseq2.res$padj) & deseq2.res$padj <= 0.05, ]
points(tmp.sig$baseMean, tmp.sig$log2FoldChange, pch=20, cex=0.45, col="red")

# 2 FC lines
abline(h=c(-1,1), col="blue")

Heatmap

#load libraries for heatmap
library("RColorBrewer")
# library( "genefilter" )
library("pheatmap")
#Regularized log transformation
rld <- rlog( deseq2.dds )
## rlog() may take a few minutes with 30 or more samples,
## vst() is a much faster transformation
#Get 25 top varying genes
topVarGenes <- head( order( rowVars (x = assay(rld), useNames = FALSE ), decreasing=TRUE ), 25)
# make heatmap of top 25 differentially expressed genes
# get top 25 counts
top25Counts<-assay(rld)[topVarGenes,]
head(top25Counts)
##                   M-C-193     M-C-216     M-C-218    M-C-226     M-C-306
## XM_060697578.1 -0.2396969 -0.18519590 -0.15532217 -0.3389453 -0.18122668
## XM_060720708.1  2.6151479 13.50027330  2.74757874  2.4672732  2.70630972
## XM_060721026.1 -0.3119173 -0.25784144 -0.22820074 -0.4103915 -0.25390318
## XM_060725432.1 -0.2856934 -0.23233585 -0.20308889 -0.3828595 -0.22844991
## XM_060711350.1 -0.8006905 -0.73502393 -0.69903002 -0.9202717 -0.73024154
## XM_060736339.1 -0.1493521 -0.09996906 -0.07290061 -0.2392807 -0.09637257
##                    M-C-329    M-C-334    M-C-337    M-C-339    M-C-358
## XM_060697578.1 -0.14192841 -0.1880788 -0.2547161 -0.2191926 -0.1916747
## XM_060720708.1 13.86004624 11.9786052  8.3851204  8.3194180  4.9329131
## XM_060721026.1 -0.21491146 -0.2607018 -0.3268193 -0.2915730 -0.2642697
## XM_060725432.1 -0.18997614 -0.2351583 -0.3003975 -0.2656193 -0.2386788
## XM_060711350.1 -0.68289231 -0.7384974 -0.8187866 -0.7759855 -0.7428301
## XM_060736339.1 -0.06076459 -0.1025812 -0.1629610 -0.1307733 -0.1058395
##                   M-C-360    M-C-363    M-C-373     M-C-482    M-C-488
## XM_060697578.1 -0.3280375 -0.2048460 -0.2077757 -0.17126528 -0.1957894
## XM_060720708.1  5.8175241 12.8644929  9.5047181 13.91402391  3.5679465
## XM_060721026.1 -0.3995688 -0.2773383 -0.2802451 -0.24401948 -0.2683523
## XM_060725432.1 -0.3721806 -0.2515737 -0.2544419 -0.21869750 -0.2427071
## XM_060711350.1 -0.9071293 -0.7586998 -0.7622296 -0.71823936 -0.7477877
## XM_060736339.1 -0.2293972 22.3949118 -0.1204285 -0.08734659 -0.1095678
##                    M-T-18     M-T-20     M-T-22     M-T-235     M-T-245
## XM_060697578.1 -0.2633529 -0.2732816 -0.2284980 -0.14522049 -0.09994681
## XM_060720708.1  8.5216570 13.5044624  2.6323874  2.76381199  2.83749318
## XM_060721026.1 -0.3353888 -0.3452400 -0.3008058 23.44592760 -0.17325733
## XM_060725432.1 -0.3088531 -0.3185735 -0.2747295 -0.19319914 23.36847273
## XM_060711350.1  0.1069638 -0.8411556 22.4398655 -0.68685882 -0.63230999
## XM_060736339.1 -0.1707868 -0.1797831 -0.1392049 -0.06374753 -0.02272526
##                    M-T-29     M-T-31     M-T-32    M-T-399     M-T-43
## XM_060697578.1 -0.2354680 -0.2023731 -0.3045249 -0.2766036 -0.3703298
## XM_060720708.1  6.0161058  2.6730148  5.2624948 12.2341733 12.3487551
## XM_060721026.1 -0.3077214 -0.2748846 -0.3762396 -0.3485361 -0.4415312
## XM_060725432.1 -0.2815532 -0.2491526 -0.3491613 -0.3218258 -0.4135856
## XM_060711350.1 -0.7955952 -0.7557202 -0.8787996 -0.8451582 -0.9580859
## XM_060736339.1 -0.1455203 -0.1155332 -0.2080925 -0.1827932 -0.2677180
##                    M-T-44    M-T-500      M-T-7     M-T-83      M-T-8
## XM_060697578.1 -0.2709108 -0.2191125 -0.3204785 24.4642846 -0.1983103
## XM_060720708.1  3.9989270 14.4596304  4.0874992 11.9092047  2.6793835
## XM_060721026.1 -0.3428877 -0.2914935 -0.3920688 -0.3258941 -0.2708536
## XM_060725432.1 -0.3162524 -0.2655409 -0.3647802 -0.2994846 -0.2451752
## XM_060711350.1 -0.8382991 -0.7758890 -0.8980217 -0.8176631 -0.7508251
## XM_060736339.1 -0.1776349 -0.1307007 -0.2225480 -0.1621161 -0.1118520
# make the heatmap. Cluster_cols = FALSE neccessary to get the M-T and M-C columns next to eachother (defult leaves em randomized)
pheatmap(top25Counts,scale = "row",show_rownames = TRUE, treeheight_col = 1, fontsize_col = 12, fontsize_row = 5, cluster_cols = FALSE)

Sig. DEG Heatmap

commenting out this section so I can knit and folks can see the outputs

# make a heatmap using only our significant DEGs

# #import data
# DEGCounts <- read.table("../output/1213-DEG.tab", header=TRUE, quote="\"")
# head(DEGCounts)
# 
# #select subset of just gene and sample columns
# DEGHeatmap <- DEGCounts[-c(2:7)]
# head(DEGHeatmap)
# 
# # establish that the first column is row names 
# # rownames(DEGHeatmap) <- DEGHeatmap$gene
# 
# #get rid of the redundant row names column that will be created.
# # DEGHeatmap <- DEGHeatmap[,-1]
# 
# 
# ### stuck here
# ### Error in seq.default(-m, m, length.out = n + 1) :'from' must be a finite number
# 
# #make heatmap
# pheatmap(DEGHeatmap, scale = "row",show_rownames = TRUE, treeheight_col = 1, fontsize_col = 10, fontsize_row = 3)

Chunk above is giving me trouble. Here’s what I want:

  1. a data frame containing rows = genes, cols = samples, values = mean exp but ONLY for things where p <= 0.05
  2. a heatmap that then shows the genes that are significantly deferentially expressed