### only run if packages not already installed
# if (!requireNamespace("BiocManager", quietly = TRUE))
# install.packages("BiocManager")
# BiocManager::install("biocLite")
library(DESeq2)
## Loading required package: S4Vectors
## Loading required package: stats4
## Loading required package: BiocGenerics
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, aperm, append, as.data.frame, basename, cbind,
## colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
## get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
## match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
## Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
## table, tapply, union, unique, unsplit, which.max, which.min
##
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:base':
##
## expand.grid, I, unname
## Loading required package: IRanges
## Loading required package: GenomicRanges
## Loading required package: GenomeInfoDb
## Loading required package: SummarizedExperiment
## Loading required package: MatrixGenerics
## Loading required package: matrixStats
##
## Attaching package: 'MatrixGenerics'
## The following objects are masked from 'package:matrixStats':
##
## colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
## colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
## colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
## colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
## colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
## colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
## colWeightedMeans, colWeightedMedians, colWeightedSds,
## colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
## rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
## rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
## rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
## rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
## rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
## rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
## rowWeightedSds, rowWeightedVars
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
##
## Attaching package: 'Biobase'
## The following object is masked from 'package:MatrixGenerics':
##
## rowMedians
## The following objects are masked from 'package:matrixStats':
##
## anyMissing, rowMedians
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ lubridate::%within%() masks IRanges::%within%()
## ✖ dplyr::collapse() masks IRanges::collapse()
## ✖ dplyr::combine() masks Biobase::combine(), BiocGenerics::combine()
## ✖ dplyr::count() masks matrixStats::count()
## ✖ dplyr::desc() masks IRanges::desc()
## ✖ tidyr::expand() masks S4Vectors::expand()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::first() masks S4Vectors::first()
## ✖ dplyr::lag() masks stats::lag()
## ✖ ggplot2::Position() masks BiocGenerics::Position(), base::Position()
## ✖ purrr::reduce() masks GenomicRanges::reduce(), IRanges::reduce()
## ✖ dplyr::rename() masks S4Vectors::rename()
## ✖ lubridate::second() masks S4Vectors::second()
## ✖ lubridate::second<-() masks S4Vectors::second<-()
## ✖ dplyr::slice() masks IRanges::slice()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
options(scipen = 999) #disables printing results in scientific notation
data <- read_delim("../output/kallisto.isoform.counts.matrix")
## New names:
## Rows: 64154 Columns: 31
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "\t" chr
## (1): ...1 dbl (30): M-C-193, M-C-216, M-C-218, M-C-226, M-C-306, M-C-329,
## M-C-334, M-C...
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
## here we want to specify our row names from out first column of data. it can be a little funky, but saving this order should work:
# create object for row names from the first column
# used the $ index because brackets were giving trouble
rownames <- data$...1
head(rownames)
## [1] "XM_060701415.1" "XM_060721439.1" "XM_060696058.1" "XR_009625421.1"
## [5] "XM_060730230.1" "XM_060707611.1"
# remove that first column now that it's values are stored
data <- data[,-1]
head(data)
## # A tibble: 6 × 30
## `M-C-193` `M-C-216` `M-C-218` `M-C-226` `M-C-306` `M-C-329` `M-C-334`
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 18 19 58 16 29 28 31
## 2 0 0 0 1 0 0 0
## 3 0 0 0 0 0 0 0
## 4 286. 179. 235. 445. 122. 139. 257.
## 5 811 694 866 869 640 864 482.
## 6 663. 360. 645. 400. 182. 259. 239.
## # ℹ 23 more variables: `M-C-337` <dbl>, `M-C-339` <dbl>, `M-C-358` <dbl>,
## # `M-C-360` <dbl>, `M-C-363` <dbl>, `M-C-373` <dbl>, `M-C-482` <dbl>,
## # `M-C-488` <dbl>, `M-T-18` <dbl>, `M-T-20` <dbl>, `M-T-22` <dbl>,
## # `M-T-235` <dbl>, `M-T-245` <dbl>, `M-T-29` <dbl>, `M-T-31` <dbl>,
## # `M-T-32` <dbl>, `M-T-399` <dbl>, `M-T-43` <dbl>, `M-T-44` <dbl>,
## # `M-T-500` <dbl>, `M-T-7` <dbl>, `M-T-83` <dbl>, `M-T-8` <dbl>
#ensures all data as integer
data <- round(data, digits = 0)
head(data)
## # A tibble: 6 × 30
## `M-C-193` `M-C-216` `M-C-218` `M-C-226` `M-C-306` `M-C-329` `M-C-334`
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 18 19 58 16 29 28 31
## 2 0 0 0 1 0 0 0
## 3 0 0 0 0 0 0 0
## 4 286 179 235 445 122 139 257
## 5 811 694 866 869 640 864 482
## 6 663 360 645 400 182 259 239
## # ℹ 23 more variables: `M-C-337` <dbl>, `M-C-339` <dbl>, `M-C-358` <dbl>,
## # `M-C-360` <dbl>, `M-C-363` <dbl>, `M-C-373` <dbl>, `M-C-482` <dbl>,
## # `M-C-488` <dbl>, `M-T-18` <dbl>, `M-T-20` <dbl>, `M-T-22` <dbl>,
## # `M-T-235` <dbl>, `M-T-245` <dbl>, `M-T-29` <dbl>, `M-T-31` <dbl>,
## # `M-T-32` <dbl>, `M-T-399` <dbl>, `M-T-43` <dbl>, `M-T-44` <dbl>,
## # `M-T-500` <dbl>, `M-T-7` <dbl>, `M-T-83` <dbl>, `M-T-8` <dbl>
# assign the row names to the data using the object created
rownames(data) <- rownames
## Warning: Setting row names on a tibble is deprecated.
head(data)
## # A tibble: 6 × 30
## `M-C-193` `M-C-216` `M-C-218` `M-C-226` `M-C-306` `M-C-329` `M-C-334`
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 18 19 58 16 29 28 31
## 2 0 0 0 1 0 0 0
## 3 0 0 0 0 0 0 0
## 4 286 179 235 445 122 139 257
## 5 811 694 866 869 640 864 482
## 6 663 360 645 400 182 259 239
## # ℹ 23 more variables: `M-C-337` <dbl>, `M-C-339` <dbl>, `M-C-358` <dbl>,
## # `M-C-360` <dbl>, `M-C-363` <dbl>, `M-C-373` <dbl>, `M-C-482` <dbl>,
## # `M-C-488` <dbl>, `M-T-18` <dbl>, `M-T-20` <dbl>, `M-T-22` <dbl>,
## # `M-T-235` <dbl>, `M-T-245` <dbl>, `M-T-29` <dbl>, `M-T-31` <dbl>,
## # `M-T-32` <dbl>, `M-T-399` <dbl>, `M-T-43` <dbl>, `M-T-44` <dbl>,
## # `M-T-500` <dbl>, `M-T-7` <dbl>, `M-T-83` <dbl>, `M-T-8` <dbl>
deseq2.colData <- data.frame(condition = factor(c(rep("Control", 15), rep("Treatment", 15))), type = factor(rep("paired-read", 30)))
rownames(deseq2.colData) <- colnames(data)
deseq2.dds <- DESeqDataSetFromMatrix(countData = data,
colData = deseq2.colData,
design = ~ condition)
## converting counts to integer mode
deseq2.dds <- DESeq(deseq2.dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 4515 genes
## -- DESeq argument 'minReplicatesForReplace' = 7
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
deseq2.res <- results(deseq2.dds)
deseq2.res <- deseq2.res[order(rownames(deseq2.res)),]
summary(deseq2.res)
##
## out of 55171 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up) : 59, 0.11%
## LFC < 0 (down) : 76, 0.14%
## outliers [1] : 0, 0%
## low counts [2] : 19624, 36%
## (mean count < 1)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
DEG <- deseq2.res[!is.na(deseq2.res$padj) & deseq2.res$padj <= 0.05, ]
tmp <- deseq2.res
write.table(DEG,"../output/1213-DEG.tab", row.names = T)
plot(tmp$baseMean, tmp$log2FoldChange, pch=20, cex=0.45, ylim=c(-10, 10), log="x", col="darkgray",
main="Treatmetn versus Control (pval <= 0.05)",
xlab="mean of normalized counts",
ylab="Log2 Fold Change")
## Warning in xy.coords(x, y, xlabel, ylabel, log): 8983 x values <= 0 omitted
## from logarithmic plot
# Getting the significant points and plotting them again so they're a different color
tmp.sig <- deseq2.res[!is.na(deseq2.res$padj) & deseq2.res$padj <= 0.05, ]
points(tmp.sig$baseMean, tmp.sig$log2FoldChange, pch=20, cex=0.45, col="red")
# 2 FC lines
abline(h=c(-1,1), col="blue")
#load libraries for heatmap
library("RColorBrewer")
# library( "genefilter" )
library("pheatmap")
#Regularized log transformation
rld <- rlog( deseq2.dds )
## rlog() may take a few minutes with 30 or more samples,
## vst() is a much faster transformation
#Get 25 top varying genes
topVarGenes <- head( order( rowVars (x = assay(rld), useNames = FALSE ), decreasing=TRUE ), 25)
# make heatmap of top 25 differentially expressed genes
# get top 25 counts
top25Counts<-assay(rld)[topVarGenes,]
head(top25Counts)
## M-C-193 M-C-216 M-C-218 M-C-226 M-C-306
## XM_060697578.1 -0.2396969 -0.18519590 -0.15532217 -0.3389453 -0.18122668
## XM_060720708.1 2.6151479 13.50027330 2.74757874 2.4672732 2.70630972
## XM_060721026.1 -0.3119173 -0.25784144 -0.22820074 -0.4103915 -0.25390318
## XM_060725432.1 -0.2856934 -0.23233585 -0.20308889 -0.3828595 -0.22844991
## XM_060711350.1 -0.8006905 -0.73502393 -0.69903002 -0.9202717 -0.73024154
## XM_060736339.1 -0.1493521 -0.09996906 -0.07290061 -0.2392807 -0.09637257
## M-C-329 M-C-334 M-C-337 M-C-339 M-C-358
## XM_060697578.1 -0.14192841 -0.1880788 -0.2547161 -0.2191926 -0.1916747
## XM_060720708.1 13.86004624 11.9786052 8.3851204 8.3194180 4.9329131
## XM_060721026.1 -0.21491146 -0.2607018 -0.3268193 -0.2915730 -0.2642697
## XM_060725432.1 -0.18997614 -0.2351583 -0.3003975 -0.2656193 -0.2386788
## XM_060711350.1 -0.68289231 -0.7384974 -0.8187866 -0.7759855 -0.7428301
## XM_060736339.1 -0.06076459 -0.1025812 -0.1629610 -0.1307733 -0.1058395
## M-C-360 M-C-363 M-C-373 M-C-482 M-C-488
## XM_060697578.1 -0.3280375 -0.2048460 -0.2077757 -0.17126528 -0.1957894
## XM_060720708.1 5.8175241 12.8644929 9.5047181 13.91402391 3.5679465
## XM_060721026.1 -0.3995688 -0.2773383 -0.2802451 -0.24401948 -0.2683523
## XM_060725432.1 -0.3721806 -0.2515737 -0.2544419 -0.21869750 -0.2427071
## XM_060711350.1 -0.9071293 -0.7586998 -0.7622296 -0.71823936 -0.7477877
## XM_060736339.1 -0.2293972 22.3949118 -0.1204285 -0.08734659 -0.1095678
## M-T-18 M-T-20 M-T-22 M-T-235 M-T-245
## XM_060697578.1 -0.2633529 -0.2732816 -0.2284980 -0.14522049 -0.09994681
## XM_060720708.1 8.5216570 13.5044624 2.6323874 2.76381199 2.83749318
## XM_060721026.1 -0.3353888 -0.3452400 -0.3008058 23.44592760 -0.17325733
## XM_060725432.1 -0.3088531 -0.3185735 -0.2747295 -0.19319914 23.36847273
## XM_060711350.1 0.1069638 -0.8411556 22.4398655 -0.68685882 -0.63230999
## XM_060736339.1 -0.1707868 -0.1797831 -0.1392049 -0.06374753 -0.02272526
## M-T-29 M-T-31 M-T-32 M-T-399 M-T-43
## XM_060697578.1 -0.2354680 -0.2023731 -0.3045249 -0.2766036 -0.3703298
## XM_060720708.1 6.0161058 2.6730148 5.2624948 12.2341733 12.3487551
## XM_060721026.1 -0.3077214 -0.2748846 -0.3762396 -0.3485361 -0.4415312
## XM_060725432.1 -0.2815532 -0.2491526 -0.3491613 -0.3218258 -0.4135856
## XM_060711350.1 -0.7955952 -0.7557202 -0.8787996 -0.8451582 -0.9580859
## XM_060736339.1 -0.1455203 -0.1155332 -0.2080925 -0.1827932 -0.2677180
## M-T-44 M-T-500 M-T-7 M-T-83 M-T-8
## XM_060697578.1 -0.2709108 -0.2191125 -0.3204785 24.4642846 -0.1983103
## XM_060720708.1 3.9989270 14.4596304 4.0874992 11.9092047 2.6793835
## XM_060721026.1 -0.3428877 -0.2914935 -0.3920688 -0.3258941 -0.2708536
## XM_060725432.1 -0.3162524 -0.2655409 -0.3647802 -0.2994846 -0.2451752
## XM_060711350.1 -0.8382991 -0.7758890 -0.8980217 -0.8176631 -0.7508251
## XM_060736339.1 -0.1776349 -0.1307007 -0.2225480 -0.1621161 -0.1118520
# make the heatmap. Cluster_cols = FALSE neccessary to get the M-T and M-C columns next to eachother (defult leaves em randomized)
pheatmap(top25Counts,scale = "row",show_rownames = TRUE, treeheight_col = 1, fontsize_col = 12, fontsize_row = 5, cluster_cols = FALSE)
commenting out this section so I can knit and folks can see the outputs
# make a heatmap using only our significant DEGs
# #import data
# DEGCounts <- read.table("../output/1213-DEG.tab", header=TRUE, quote="\"")
# head(DEGCounts)
#
# #select subset of just gene and sample columns
# DEGHeatmap <- DEGCounts[-c(2:7)]
# head(DEGHeatmap)
#
# # establish that the first column is row names
# # rownames(DEGHeatmap) <- DEGHeatmap$gene
#
# #get rid of the redundant row names column that will be created.
# # DEGHeatmap <- DEGHeatmap[,-1]
#
#
# ### stuck here
# ### Error in seq.default(-m, m, length.out = n + 1) :'from' must be a finite number
#
# #make heatmap
# pheatmap(DEGHeatmap, scale = "row",show_rownames = TRUE, treeheight_col = 1, fontsize_col = 10, fontsize_row = 3)
Chunk above is giving me trouble. Here’s what I want: