--- title: "Differential gene expression analysis & PCA using Kallisto" output: md_document date: "2023-04-22" --- ```{r setup, include=FALSE} library(knitr) library(tidyverse) library(DESeq2) library(pheatmap) library(RColorBrewer) library(data.table) knitr::opts_chunk$set( echo = TRUE, # Display code chunks eval = FALSE, # Setting this to false will allow us to knit the code without evaluating code chunks warning = FALSE, # Hide warnings message = FALSE, # Hide messages fig.width = 6, # Set plot width in inches fig.height = 4, # Set plot height in inches fig.align = "center" # Align plots to the center ) ``` # Accessing a reference to produce an index Download the 99k mRNA "complete records" for *Botryllus schlosseri* from NCBI as FASTA file and store in data folder. Then run the following code chunk which will use the index command in Kallisto to create a reference index of all 99k records. It renames that fasta file you upload to 'bsc_rna.index'. ```{r, engine = 'bash'} /home/shared/kallisto/kallisto \ index -i \ /home/shared/8TB_HDD_02/cvaldi/celeste-tunicate-devo/data/bsc_rna.index \ /home/shared/8TB_HDD_02/cvaldi/celeste-tunicate-devo/data/sequence.fasta ``` Running the following code chunks produces a bunch of errors regarding file pathways. I think I am running into issues because my RNAseq data is in a different hard drive than my current working directory. However, If I simplify the code chunk to ask it for just one of my files (thus skipping the find command). It's possible to run the Kallisto quant command. Eventually, I may come back to this code chunk to troubleshoot. ```{r, engine = 'bash'} find /home/shared/8TB_HDD_01/cvaldi/tunicate-fish546/raw/*_1.fastq \ | sed 's/_1.fastq$//' \ | xargs -I{} /home/shared/kallisto/kallisto \ quant -i /home/shared/8TB_HDD_02/cvaldi/celeste-tunicate-devo/data/bsc_rna.index \ -o /home/shared/8TB_HDD_02/cvaldi/celeste-tunicate-devo/output/kallisto_01/{} \ -t 40 \ /home/shared/8TB_HDD_01/cvaldi/tunicate-fish546/raw/{}_1.fastq /home/shared/8TB_HDD_01/cvaldi/tunicate-fish546/raw/{}_2.fastq ``` # Conducting psueo-alignments one query at a time... ### Quant SRR10352205 ```{r, engine = 'bash'} /home/shared/kallisto/kallisto quant -i ../data/bsc_rna.index \ -o ../output/kallisto_01/SRR10352205 \ -t 40 \ /home/shared/8TB_HDD_01/cvaldi/tunicate-fish546/raw/SRR10352205_1.fastq \ /home/shared/8TB_HDD_01/cvaldi/tunicate-fish546/raw/SRR10352205_2.fastq ``` ### Quant SRR10352206 ```{r, engine = 'bash'} /home/shared/kallisto/kallisto quant -i ../data/bsc_rna.index \ -o ../output/kallisto_01/SRR10352206 \ -t 40 \ /home/shared/8TB_HDD_01/cvaldi/tunicate-fish546/raw/SRR10352206_1.fastq \ /home/shared/8TB_HDD_01/cvaldi/tunicate-fish546/raw/SRR10352206_2.fastq ``` ### Quant SRR10352224 ```{r, engine = 'bash'} /home/shared/kallisto/kallisto quant -i ../data/bsc_rna.index \ -o ../output/kallisto_01/SRR10352224 \ -t 40 \ /home/shared/8TB_HDD_01/cvaldi/tunicate-fish546/raw/SRR10352224_1.fastq \ /home/shared/8TB_HDD_01/cvaldi/tunicate-fish546/raw/SRR10352224_2.fastq ``` ### Quant SRR10352254 ```{r, engine = 'bash'} /home/shared/kallisto/kallisto quant -i ../data/bsc_rna.index \ -o ../output/kallisto_01/SRR10352254 \ -t 40 \ /home/shared/8TB_HDD_01/cvaldi/tunicate-fish546/raw/SRR10352254_1.fastq \ /home/shared/8TB_HDD_01/cvaldi/tunicate-fish546/raw/SRR10352254_2.fastq ``` # Creating gene expression matracies from kallisto output files This command runs the abundance_estimates_to_matrix.pl script from the Trinity RNA-seq assembly software package to create a gene expression matrix from kallisto output files. ```{r, engine = 'bash'} perl /home/shared/trinityrnaseq-v2.12.0/util/abundance_estimates_to_matrix.pl \ --est_method kallisto \ --gene_trans_map none \ --out_prefix ../output/kallisto_01 \ --name_sample_by_basedir \ ../output/kallisto_01/SRR10352205/abundance.tsv \ ../output/kallisto_01/SRR10352206/abundance.tsv \ ../output/kallisto_01/SRR10352224/abundance.tsv \ ../output/kallisto_01/SRR10352254/abundance.tsv ``` # Using DESeq2 ```{r} countmatrix <- read.delim("../output/kallisto_01.isoform.counts.matrix", header = TRUE, sep = '\t') rownames(countmatrix) <- countmatrix$X #this line renames the row names from 1-n to the value of the first column called x countmatrix <- countmatrix[,-1] #this gets rid of the first column countmatrix <- round(countmatrix, 0) #rounds values in count matrix head(countmatrix) ``` The following code chunk will make a new data frame that will contain information about the treatment groups and the type of RNAseq data produced. This will be called the object "deseq2.colData". We then use the DESeqDataSetFromMatrix function from the DESeq2 package ```{r} deseq2.colData <- data.frame(condition=factor(c(rep("b5-b6", 2), rep("TO", 2))), type=factor(rep("paired-end", 4))) rownames(deseq2.colData) <- colnames(data) #not sure what the purpose of this line is, it doesn't seem to do anything deseq2.dds <- DESeqDataSetFromMatrix(countData = countmatrix, colData = deseq2.colData, design = ~ condition) ``` ```{r} deseq2.dds <- DESeq(deseq2.dds) #takes the DESeq data set we created and conducts a DESeq analysis deseq2.res <- results(deseq2.dds) #here we take a look at the results of the analysis deseq2.res <- deseq2.res[order(rownames(deseq2.res)), ] head(deseq2.res) ``` ```{r} # Count number of hits with adjusted p-value less then 0.05 dim(deseq2.res[!is.na(deseq2.res$padj) & deseq2.res$padj <= 0.05, ]) ``` ```{r} tmp <- deseq2.res # The main plot plot(tmp$baseMean, tmp$log2FoldChange, pch=20, cex=0.45, ylim=c(-3, 3), log="x", col="darkgray", main="DEG Dessication (pval <= 0.05)", xlab="mean of normalized counts", ylab="Log2 Fold Change") # Getting the significant points and plotting them again so they're a different color tmp.sig <- deseq2.res[!is.na(deseq2.res$padj) & deseq2.res$padj <= 0.05, ] points(tmp.sig$baseMean, tmp.sig$log2FoldChange, pch=20, cex=0.45, col="red") # 2 FC lines abline(h=c(-1,1), col="blue") ``` PCA plot: ```{r} vsd <- vst(deseq2.dds, blind = FALSE) plotPCA(vsd, intgroup = "condition") ``` ```{r} # Select top 50 differentially expressed genes res <- results(deseq2.dds) res_ordered <- res[order(res$padj), ] top_genes <- row.names(res_ordered)[1:50] # Extract counts and normalize counts <- counts(deseq2.dds, normalized = TRUE) counts_top <- counts[top_genes, ] # Log-transform counts log_counts_top <- log2(counts_top + 1) # Generate heatmap pheatmap(log_counts_top, scale = "row") ``` ```{r} # Prepare the data for plotting res_df <- as.data.frame(deseq2.res) res_df$gene <- row.names(res_df) # Create volcano plot volcano_plot <- ggplot(res_df, aes(x = log2FoldChange, y = -log10(padj), color = padj < 0.05)) + geom_point(alpha = 0.6, size = 1.5) + scale_color_manual(values = c("grey", "red")) + labs(title = "Volcano Plot", x = "Log2 Fold Change", y = "-Log10 Adjusted P-value", color = "Significantly\nDifferentially Expressed") + theme_minimal() + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), legend.position = "top") print(volcano_plot) ``` # Improving look of initial exploratory plots Plotting only the significant points on deg: ```{r} mp <- deseq2.res sigPoints <- !is.na(mp$padj) & mp$padj <= 0.05 # Define color palette colorPalette <- c(gray(seq(0.8, 0.2, length.out = 100)), "red") # The main plot plot(mp$baseMean[sigPoints], mp$log2FoldChange[sigPoints], pch=10, cex=0.45, ylim=c(-3, 3), log="x", col=colorPalette[1], main="Differentially Expressed Genes in Blastogenic Stage (pval <= 0.05)", xlab="Mean Expression Level", ylab="Log2 Fold Change (vs Control)", cex.lab=1.5, cex.main=2) # Getting the significant points and plotting them again so they're a different color points(mp$baseMean[sigPoints], mp$log2FoldChange[sigPoints], pch=20, cex=0.45, col=colorPalette[length(colorPalette)]) # 2 FC lines abline(h=c(-1,1), col="blue") # Add legend ``` ```{r} write.table(tmp.sig, "../output/DEGlist.tab", row.names = T) ```