--- title: "Differential gene expression analysis using Kallisto" output: md_document date: "2023-04-22" --- ```{r setup, include=FALSE} library(knitr) library(tidyverse) library(DESeq2) library(pheatmap) library(RColorBrewer) library(data.table) knitr::opts_chunk$set( echo = TRUE, # Display code chunks eval = FALSE, # Setting this to false will allow us to knit the code without evaluating code chunks warning = FALSE, # Hide warnings message = FALSE, # Hide messages fig.width = 6, # Set plot width in inches fig.height = 4, # Set plot height in inches fig.align = "center" # Align plots to the center ) ``` # Accessing a reference to produce an index Download the 99k mRNA "complete records" for *Botryllus schlosseri* from NCBI as FASTA file and store in data folder. Note that I did not curl this information but had to manually download it. Then run the following code chunk which will use the index command in Kallisto to create a reference index of all 99k records. It renames that fasta file you upload to 'bsc_rna.index'. I apologize for the absolute paths used to name my new index and to id where the fasta files is... just replace those as need be. ```{r, engine = 'bash'} /home/shared/kallisto/kallisto \ index -i \ /home/shared/8TB_HDD_02/cvaldi/celeste-tunicate-devo/data/bsc_rna.index \ /home/shared/8TB_HDD_02/cvaldi/celeste-tunicate-devo/data/sequence.fasta ``` # Psuedo-alginment of all queries in your raw data directory Below, I am setting an input directory where my raw RNAseq files live and an index_file defining where the index we just created is. I then specify where I want kallisto to store the output files from this alignment (a pre-made folder called kallisto_01 in the output folder). I then have a loop that will go through all the paired-end fastq files in the input directory that will use the quant command in kallisto and quantify abundance of transcripts for each pair of fastq files. It takes the base name of the inut file and names the new folder it creates for one query that same base name. ```{r, engine='bash'} #!/bin/bash # Set input directory and index file input_dir=/home/shared/8TB_HDD_01/cvaldi/tunicate-fish546/raw/ index_file=../data/psuedo-alignment/bsc_rna.index # Set output directory output_dir=../output/kallisto_01 # Set number of threads threads=10 # Loop through all paired-end fastq files in input directory for f in ${input_dir}/*_1.fastq; do # Get base filename base=$(basename ${f} _1.fastq) # Run kallisto to quantify transcripts /home/shared/kallisto/kallisto quant -i ${index_file} \ -o ${output_dir}/${base} \ -t ${threads} \ ${input_dir}/${base}_1.fastq ${input_dir}/${base}_2.fastq done ``` # Creating gene expression matracies from kallisto output files This command runs the abundance_estimates_to_matrix.pl script from the Trinity RNA-seq assembly software package to create a gene expression matrix from kallisto output files. ```{r, engine = 'bash'} perl /home/shared/trinityrnaseq-v2.12.0/util/abundance_estimates_to_matrix.pl \ --est_method kallisto \ --gene_trans_map none \ --out_prefix ../output/kallisto_01 \ --name_sample_by_basedir \ ../output/kallisto_01/SRR10352205/abundance.tsv \ ../output/kallisto_01/SRR10352206/abundance.tsv \ ../output/kallisto_01/SRR10352224/abundance.tsv \ ../output/kallisto_01/SRR10352254/abundance.tsv ``` # Using DESeq2 ```{r} countmatrix <- read.delim("../output/kallisto_01.isoform.counts.matrix", header = TRUE, sep = '\t') rownames(countmatrix) <- countmatrix$X #this line renames the row names from 1-n to the value of the first column called x countmatrix <- countmatrix[,-1] #this gets rid of the first column countmatrix <- round(countmatrix, 0) #rounds values in count matrix head(countmatrix) ``` The following code chunk will make a new data frame that will contain information about the treatment groups and the type of RNAseq data produced. This will be called the object "deseq2.colData". We then use the DESeqDataSetFromMatrix function from the DESeq2 package ```{r} deseq2.colData <- data.frame(condition=factor(c(rep("b5-b6", 2), rep("TO", 2))), type=factor(rep("paired-end", 4))) rownames(deseq2.colData) <- colnames(data) #not sure what the purpose of this line is, it doesn't seem to do anything deseq2.dds <- DESeqDataSetFromMatrix(countData = countmatrix, colData = deseq2.colData, design = ~ condition) ``` ```{r} deseq2.dds <- DESeq(deseq2.dds) #takes the DESeq data set we created and conducts a DESeq analysis deseq2.res <- results(deseq2.dds) #here we take a look at the results of the analysis deseq2.res <- deseq2.res[order(rownames(deseq2.res)), ] head(deseq2.res) ``` ```{r} # Count number of hits with adjusted p-value less then 0.05 dim(deseq2.res[!is.na(deseq2.res$padj) & deseq2.res$padj <= 0.05, ]) ``` ```{r} tmp <- deseq2.res # The main plot plot(tmp$baseMean, tmp$log2FoldChange, pch=20, cex=0.45, ylim=c(-3, 3), log="x", col="darkgray", main="DEG Dessication (pval <= 0.05)", xlab="mean of normalized counts", ylab="Log2 Fold Change") # Getting the significant points and plotting them again so they're a different color tmp.sig <- deseq2.res[!is.na(deseq2.res$padj) & deseq2.res$padj <= 0.05, ] points(tmp.sig$baseMean, tmp.sig$log2FoldChange, pch=20, cex=0.45, col="red") # 2 FC lines abline(h=c(-1,1), col="blue") ``` PCA plot: ```{r} vsd <- vst(deseq2.dds, blind = FALSE) plotPCA(vsd, intgroup = "condition") ``` ```{r} # Select top 50 differentially expressed genes res <- results(deseq2.dds) res_ordered <- res[order(res$padj), ] top_genes <- row.names(res_ordered)[1:50] # Extract counts and normalize counts <- counts(deseq2.dds, normalized = TRUE) counts_top <- counts[top_genes, ] # Log-transform counts log_counts_top <- log2(counts_top + 1) # Generate heatmap pheatmap(log_counts_top, scale = "row") ``` ```{r} # Prepare the data for plotting res_df <- as.data.frame(deseq2.res) res_df$gene <- row.names(res_df) # Create volcano plot volcano_plot <- ggplot(res_df, aes(x = log2FoldChange, y = -log10(padj), color = padj < 0.05)) + geom_point(alpha = 0.6, size = 1.5) + scale_color_manual(values = c("grey", "red")) + labs(title = "Volcano Plot", x = "Log2 Fold Change", y = "-Log10 Adjusted P-value", color = "Significantly\nDifferentially Expressed") + theme_minimal() + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), legend.position = "top") print(volcano_plot) ``` # Improving look of initial exploratory plots Plotting only the significant points on deg: ```{r} mp <- deseq2.res sigPoints <- !is.na(mp$padj) & mp$padj <= 0.05 # Define color palette colorPalette <- c(gray(seq(0.8, 0.2, length.out = 100)), "red") # The main plot plot(mp$baseMean[sigPoints], mp$log2FoldChange[sigPoints], pch=10, cex=0.45, ylim=c(-3, 3), log="x", col=colorPalette[1], main="Differentially Expressed Genes in Blastogenic Stage (pval <= 0.05)", xlab="Mean Expression Level", ylab="Log2 Fold Change (vs Control)", cex.lab=1.5, cex.main=2) # Getting the significant points and plotting them again so they're a different color points(mp$baseMean[sigPoints], mp$log2FoldChange[sigPoints], pch=20, cex=0.45, col=colorPalette[length(colorPalette)]) # 2 FC lines abline(h=c(-1,1), col="blue") # Add legend ``` ```{r} write.table(tmp.sig, "../output/DEGlist.tab", row.names = T) ```