--- title: "3.0-RNASeq" author: Susan Garcia date: "2023-04-14" output: html_document: theme: cosmo highlight: tango toc: true toc_float: true number_sections: true code_folding: show code_download: true --- ```{r setup, include=FALSE} library(knitr) library(tidyverse) library(kableExtra) library(DESeq2) library(pheatmap) library(RColorBrewer) library(data.table) library(DT) library(Biostrings) knitr::opts_chunk$set( echo = TRUE, # Display code chunks eval = FALSE, # Evaluate code chunks warning = FALSE, # Hide warnings message = FALSE, # Hide messages fig.width = 6, # Set plot width in inches fig.height = 4, # Set plot height in inches fig.align = "center" # Align plots to the center ) ``` #Kallisto Assignment: Creating a workflow for RNA Seq ##Downloading reference ```{r, engine='bash'} # Setting Directory and downloading data cd ../data curl --insecure -O https://gannet.fish.washington.edu/seashell/bu-github/nb-2023/Cgigas/data/rna.fna ``` ##Kallisto is installed on the Raven server. ```{r, engine='bash'} /home/shared/kallisto/kallisto \ index -i \ ../data/cgigas_roslin_rna.index \ ../data/rna.fna ``` ##Downloading sequence reads from the server. ```{r, engine='bash'} #Setting Directory cd ../data #Downloading fastq files using wget wget --recursive --no-parent --no-directories \ --no-check-certificate \ --accept '*.fastq.gz' \ https://gannet.fish.washington.edu/seashell/bu-github/nb-2023/Cgigas/data/nopp/ ``` #Running Kallisto on fastq for alignment. ```{r, engine='bash'} #Creating kallisto_01 folder before alignment/run. mkdir ../output/kallisto_01 find ../data/*fastq.gz \ | xargs basename -s _L001_R1_001.fastq.gz | xargs -I{} /home/shared/kallisto/kallisto \ quant -i ../data/cgigas_roslin_rna.index \ -o ../output/kallisto_01/{} \ -t 40 \ --single -l 100 -s 10 ../data/{}_L001_R1_001.fastq.gz ``` ##COmbining output files for visualization ```{r, engine='bash'} perl /home/shared/trinityrnaseq-v2.12.0/util/abundance_estimates_to_matrix.pl \ --est_method kallisto \ --gene_trans_map none \ --out_prefix ../output/kallisto_01 \ --name_sample_by_basedir \ ../output/kallisto_01/D54_S145/abundance.tsv \ ../output/kallisto_01/D56_S136/abundance.tsv \ ../output/kallisto_01/D58_S144/abundance.tsv \ ../output/kallisto_01/M45_S140/abundance.tsv \ ../output/kallisto_01/M48_S137/abundance.tsv \ ../output/kallisto_01/M89_S138/abundance.tsv \ ../output/kallisto_01/D55_S146/abundance.tsv \ ../output/kallisto_01/D57_S143/abundance.tsv \ ../output/kallisto_01/D59_S142/abundance.tsv \ ../output/kallisto_01/M46_S141/abundance.tsv \ ../output/kallisto_01/M49_S139/abundance.tsv \ ../output/kallisto_01/M90_S147/abundance.tsv \ ../output/kallisto_01/N48_S194/abundance.tsv \ ../output/kallisto_01/N50_S187/abundance.tsv \ ../output/kallisto_01/N52_S184/abundance.tsv \ ../output/kallisto_01/N54_S193/abundance.tsv \ ../output/kallisto_01/N56_S192/abundance.tsv \ ../output/kallisto_01/N58_S195/abundance.tsv \ ../output/kallisto_01/N49_S185/abundance.tsv \ ../output/kallisto_01/N51_S186/abundance.tsv \ ../output/kallisto_01/N53_S188/abundance.tsv \ ../output/kallisto_01/N55_S190/abundance.tsv \ ../output/kallisto_01/N57_S191/abundance.tsv \ ../output/kallisto_01/N59_S189/abundance.tsv ``` #Run DESeq2 ```{r, eval=T} countmatrix <- read.delim("../output/kallisto_01.isoform.counts.matrix", header = TRUE, sep = '\t') rownames(countmatrix) <- countmatrix$X countmatrix <- countmatrix[,-1] head(countmatrix) ``` Rounding to whole numbers. ```{r, eval=T} countmatrix <- round(countmatrix, 0) str(countmatrix) ``` #GET DEGs BASED ON DESICATION ```{r, eval=T} deseq2.colData <- data.frame(condition=factor(c(rep("control", 12), rep("desicated", 12))), type=factor(rep("single-read", 24))) rownames(deseq2.colData) <- colnames(data) #Creates the DESeq dataframe deseq2.dds <- DESeqDataSetFromMatrix(countData = countmatrix, colData = deseq2.colData, design = ~ condition) ``` ```{r, eval=T} #Runs the DESeq deseq2.dds <- DESeq(deseq2.dds) #Makes a dataframe for the results deseq2.res <- results(deseq2.dds) deseq2.res <- deseq2.res[order(rownames(deseq2.res)), ] ``` ```{r PCA, eval=TRUE} vsd <- vst(deseq2.dds, blind = FALSE) #Plots Graph based on condition plotPCA(vsd, intgroup = "condition") ``` ```{r heatmap, eval=TRUE} #Selecting the top 50 differentially expressed genes res <- results(deseq2.dds) res_ordered <- res[order(res$padj), ] top_genes <- row.names(res_ordered)[1:50] #Extracting the counts and normalizing counts <- counts(deseq2.dds, normalized = TRUE) counts_top <- counts[top_genes, ] #Log-transform counts log_counts_top <- log2(counts_top + 1) #Generate a heat map (assure package is installed) pheatmap(log_counts_top, scale = "row") head(deseq2.res) ``` ```{r, eval=TRUE} # Count number of hits with adjusted p-value less then 0.05 dim(deseq2.res[!is.na(deseq2.res$padj) & deseq2.res$padj <= 0.05, ]) ``` #Visualizing Data ```{r, eval=TRUE} temp <- deseq2.res # The main plot plot(temp$baseMean, temp$log2FoldChange, pch=20, cex=0.45, ylim=c(-3, 3), log="x", col="gray", main="DEG Dessication (pval <= 0.05)", xlab="mean of normalized counts", ylab="Log2 Fold Change") # Getting the significant points and plotting them again so they're a different color temp.sig <- deseq2.res[!is.na(deseq2.res$padj) & deseq2.res$padj <= 0.05, ] points(temp.sig$baseMean, temp.sig$log2FoldChange, pch=20, cex=0.45, col="green") # 2 FC lines abline(h=c(-1,1), col="purple") ``` #Create a new plot. ```{r, eval=TRUE} # Prepping data res_df <- as.data.frame(deseq2.res) res_df$gene <- row.names(res_df) #Volcano Plot volcano_plot <- ggplot(res_df, aes(x = log2FoldChange, y = -log10(padj), color = padj < 0.05)) + geom_point(alpha = 0.6, size = 1.5) + scale_color_manual(values = c("yellow", "red")) + labs(title = "Volcano Plot", x = "Log2 Fold Change", y = "-Log10 Adjusted P-value", color = "Significantly\nDifferentially Expressed") + theme_minimal() + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), legend.position = "top") #See your plot print(volcano_plot) ``` ```{r, eval=TRUE} #Create a table for output write.table(temp.sig, "../output/DEGlist.tab", sep = '\t', row.names = T) ``` ```{r, eval=TRUE} deglist <- read.csv("../output/DEGlist.tab", sep = '\t', header = TRUE) deglist$RowName <- rownames(deglist) deglist2 <- deglist[, c("RowName", "pvalue")] #See your plot print(deglist) #Check datable datatable(deglist) ``` # What genes are these? ## The gene enrichment analysis ```{r, eval=TRUE} gene_deg_stat <- res_df %>% mutate(degstaus = ifelse(padj < 0.05, 1, 0)) ``` ```{r, eval=TRUE} # Read the FASTA file fasta_data <- readDNAStringSet("../data/rna.fna") # Calculate gene lengths gene_lengths <- width(fasta_data) # Extract gene names/IDs from sequence IDs gene_names <- sapply(names(fasta_data), function(x) strsplit(x, " ")[[1]][1]) # Create a data frame with gene IDs and lengths gene_lengths_df <- data.frame(geneID = gene_names, length = gene_lengths) ```