--- title: "RNA-seq Data Retrieval & Kallisto Evaluation" author: Chris date: "`r format(Sys.time(), '%d %B, %Y')`" output: html_document: theme: readable highlight: zenburn toc: true toc_float: true number_sections: true code_folding: show code_download: true editor: markdown: wrap: 72 --- ```{r setup, include=FALSE} library(knitr) library(tidyverse) library(kableExtra) library(DESeq2) library(pheatmap) library(RColorBrewer) library(data.table) library(DT) library(Biostrings) knitr::opts_chunk$set( echo = TRUE, # Display code chunks eval = FALSE, # Evaluate code chunks warning = FALSE, # Hide warnings message = FALSE, # Hide messages fig.width = 6, # Set plot width in inches fig.height = 4, # Set plot height in inches fig.align = "center" # Align plots to the center ) ``` **Data**\ Existing RNA-Seq data was retrieved from the following complete RNASeq data available in the [NCBI database](https://www.ncbi.nlm.nih.gov/):\ - SRR19782039 [Exposure to Valsartan & Carbamazepine](https://www.ncbi.nlm.nih.gov/sra/SRX15826291%5Baccn%5D)\ - SRR16771870 [Exposure to a synthetic hormone 17 a-Ethinylestradiol (EE2)](https://www.ncbi.nlm.nih.gov/sra/SRX12971792%5Baccn%5D)\ - SRR7725722 [Diarrhetic Shellfish Poisoning (DSP) toxins associated with Harmful Algal Blooms (HABs)](https://www.ncbi.nlm.nih.gov/sra/SRX4582204%5Baccn%5D)\ - SRR13013756 [Hypoxia](https://www.ncbi.nlm.nih.gov/sra/SRX9464766%5Baccn%5D)\ - *Mytilus galloprovincialis* [Reference Genome](https://www.ncbi.nlm.nih.gov/data-hub/genome/GCA_900618805.1/) ```{r} getwd() ``` ### Pulling my reference genome from prefab code provided by NCBI curled genome and copied just cds file ```{r, engine='bash', eval=TRUE} cd /home/shared/8TB_HDD_02/cnmntgna/GitHub/chris-musselcon/data/ncbi_dataset/data/GCA_900618805.1 head cds_from_genomic.fna ``` # Create the index file to align my short read files to the genes from the MGAL_10 ```{r, engine='bash'} /home/shared/kallisto/kallisto \ index \ -i ../data/MGAL_cds.index \ ../data/ncbi_dataset/data/GCA_900618805.1/cds_from_genomic.fna ``` ```{bash} head ../data/ncbi_dataset/data/GCA_900618805.1/cds_from_genomic.fna ``` # Where is data #Runnning Kallisto ```{r, engine='bash', eval=TRUE} /home/shared/kallisto/kallisto quant ``` ```{r, engine='bash', eval=TRUE} ls /home/shared/8TB_HDD_02/cnmntgna/GitHub/chris-musselcon/output/ncbi/ ``` ```{r, engine='bash'} #mkdir ../output mkdir ../output/kallisto_01 #OLD CODE find /home/shared/8TB_HDD_02/cnmntgna/GitHub/chris-musselcon/output/ncbi/*_1.fastq \ | xargs basename -s _1.fastq | xargs -I{} /home/shared/kallisto/kallisto \ quant -i ../data/MGAL_cds.index \ -o ../output/kallisto_01/{} \ -t 4 \ /home/shared/8TB_HDD_02/cnmntgna/GitHub/chris-musselcon/output/ncbi/{}_1.fastq \ /home/shared/8TB_HDD_02/cnmntgna/GitHub/chris-musselcon/output/ncbi/{}_2.fastq \ ``` Getting length mean & SD for the single read sequences in the .fastq files. Code taken from: https://www.biostars.org/p/243552/ ```{bash} awk 'BEGIN { t=0.0;sq=0.0; n=0;} ;NR%4==2 {n++;L=length($0);t+=L;sq+=L*L;}END{m=t/n;printf("total %d avg=%f stddev=%f\n",n,m,sq/n-m*m);}' /home/shared/8TB_HDD_02/cnmntgna/GitHub/chris-musselcon/output/ncbi/*.fastq ``` Grabbing only the fastq files ```{r, eval=FALSE, engine='bash'} #Running for .fastq to pick up the other two, since i missed that not all had a 1 or 2 #how do i get *.fastq files as well? I thought I understood the syntax, but I am still getting it wrong find /home/shared/8TB_HDD_02/cnmntgna/GitHub/chris-musselcon/output/ncbi/*.fastq \ | xargs basename -s .fastq | xargs -I{} /home/shared/kallisto/kallisto \ quant -i ../data/MGAL_cds.index \ --single -l 92 -s 405 \ -o ../output/kallisto_01/{} \ -t 4 \ /home/shared/8TB_HDD_02/cnmntgna/GitHub/chris-musselcon/output/ncbi/{}.fastq ``` #DESeq from Kallisto output #trouble calling the program from the whole chunk below so I separated it to make sure I'm going to the right place. ```{r} getwd() ``` ```{r, engine='bash'} cd /home/shared/trinityrnaseq-v2.12.0 ``` #Kallisto ```{r, engine='bash'} perl /home/shared/trinityrnaseq-v2.12.0/util/abundance_estimates_to_matrix.pl \ --est_method kallisto \ --gene_trans_map none \ --out_prefix /home/shared/8TB_HDD_02/cnmntgna/GitHub/chris-musselcon/output/kallisto_01 \ --name_sample_by_basedir \ /home/shared/8TB_HDD_02/cnmntgna/GitHub/chris-musselcon/output/kallisto_01/SRR7725722/abundance.tsv \ /home/shared/8TB_HDD_02/cnmntgna/GitHub/chris-musselcon/output/kallisto_01/SRR7725722_1/abundance.tsv \ /home/shared/8TB_HDD_02/cnmntgna/GitHub/chris-musselcon/output/kallisto_01/SRR7725722_2/abundance.tsv \ /home/shared/8TB_HDD_02/cnmntgna/GitHub/chris-musselcon/output/kallisto_01/SRR13013756/abundance.tsv \ /home/shared/8TB_HDD_02/cnmntgna/GitHub/chris-musselcon/output/kallisto_01/SRR16771870/abundance.tsv \ /home/shared/8TB_HDD_02/cnmntgna/GitHub/chris-musselcon/output/kallisto_01/SRR16771870_1/abundance.tsv \ /home/shared/8TB_HDD_02/cnmntgna/GitHub/chris-musselcon/output/kallisto_01/SRR16771870_2/abundance.tsv \ /home/shared/8TB_HDD_02/cnmntgna/GitHub/chris-musselcon/output/kallisto_01/SRR19782039/abundance.tsv ``` ```{r} countmatrix <- read.delim("../output/kallisto_01.isoform.counts.matrix", header = TRUE, sep = '\t') rownames(countmatrix) <- countmatrix$X countmatrix <- countmatrix[,-1] head(countmatrix) #tail(countmatrix) #dim(countmatrix) ``` ```{r} countmatrix <- round(countmatrix, 0) str(countmatrix) ``` ```{r} dim(countmatrix) #dim(deseq2.colData) ``` ```{r} length(colnames(data)) ``` ```{r} #compare different manually - compare the synthetic estrogens versus the dsp - 3x3 and drop the valsartan deseq2.colData <- data.frame(condition=factor(c("DSP", "DSP", "Hypoxia", "EE2", "EE2", "EE2", "Valsartan")), type=factor(rep("single-read", 7))) rownames(deseq2.colData) <- colnames(data) dim(deseq2.colData) deseq2.dds <- DESeqDataSetFromMatrix(countData = countmatrix, colData = deseq2.colData, design = ~ condition) ``` ```{r} deseq2.dds <- DESeq(deseq2.dds) deseq2.res <- results(deseq2.dds) deseq2.res <- deseq2.res[order(rownames(deseq2.res)), ] head(deseq2.dds) ``` ```{r} vsd <- vst(deseq2.dds, blind = FALSE) plotPCA(vsd, intgroup = "condition") ``` ```{r} # Select top 50 differentially expressed genes - is it valuable to select more than 50? res <- results(deseq2.dds) res_ordered <- res[order(res$padj), ] top_genes <- row.names(res_ordered)[1:50] # Extract counts and normalize counts <- counts(deseq2.dds, normalized = TRUE) counts_top <- counts[top_genes, ] # Log-transform counts log_counts_top <- log2(counts_top + 1) # Generate heatmap pheatmap(log_counts_top, scale = "row") ``` ```{r} #what are we doing here and how do i get it to compare the others against each other? head(deseq2.res) ``` ```{r} # Count number of hits with adjusted p-value less then 0.05 dim(deseq2.res[!is.na(deseq2.res$padj) & deseq2.res$padj <= 0.05, ]) ``` ```{r} tmp <- deseq2.res # The main plot plot(tmp$baseMean, tmp$log2FoldChange, pch=20, cex=0.45, ylim=c(-3, 3), log="x", col="darkgray", main="DEG Valsartan vs DSP (pval <= 0.05)", xlab="mean of normalized counts", ylab="Log2 Fold Change") # Getting the significant points and plotting them again so they're a different color tmp.sig <- deseq2.res[!is.na(deseq2.res$padj) & deseq2.res$padj <= 0.05, ] points(tmp.sig$baseMean, tmp.sig$log2FoldChange, pch=20, cex=0.45, col="red") # 2 FC lines abline(h=c(-1,1), col="blue") ``` ```{r} # Prepare the data for plotting res_df <- as.data.frame(deseq2.res) res_df$gene <- row.names(res_df) # Create volcano plot volcano_plot <- ggplot(res_df, aes(x = log2FoldChange, y = -log10(padj), color = padj < 0.05)) + geom_point(alpha = 0.6, size = 1.5) + scale_color_manual(values = c("grey", "red")) + labs(title = "Volcano Plot", x = "Log2 Fold Change", y = "-Log10 Adjusted P-value", color = "Significantly\nDifferentially Expressed") + theme_minimal() + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), legend.position = "top") print(volcano_plot) ``` ```{r} write.table(tmp.sig, "../output/DEGlist.tab", sep = '\t', row.names = T) ``` ```{r} deglist <- read.csv("../output/DEGlist.tab", sep = '\t', header = TRUE) deglist$RowName <- rownames(deglist) deglist2 <- deglist[, c("RowName", "pvalue")] # Optionally, reorder the columns ``` ```{r} datatable(deglist) ``` #Annotation \## BLAST to uniprot, use HISAT? \### Enrichr install/ call ```{r} install.packages("devtools") library(devtools) install_github("wjawaid/enrichR") library("enrichR") listEnrichrSites() dbs <- listEnrichrDbs() head(dbs) ``` ```{r} ``` # Steven's code #Need to modify for my files, just copying code in first ```{r} cg_sp <- read.csv("https://raw.githubusercontent.com/sr320/nb-2022/main/C_gigas/analyses/CgR-blastp-sp.tab", header = FALSE, sep="\t") %>% distinct(V1, .keep_all = TRUE) ``` ```{r} loc <- read.csv("https://raw.githubusercontent.com/sr320/nb-2022/main/C_gigas/analyses/LOC_Acc.tab", sep = " ", header = FALSE) ``` ```{r} comb <- left_join(loc, cg_sp, by = c("V2" = "V1")) %>% left_join(deglist, by = c("V1" = "RowName")) ``` #Same as annotation process #Gene enrichment analysis ```{r} gene_deg_status <- res_df %>% mutate(degstaus = ifelse(padj < 0.05, 1, 0)) ``` ```{r} # Read the FASTA file fasta_data <- readDNAStringSet("/home/shared/8TB_HDD_02/cnmntgna/GitHub/chris-musselcon/data/ncbi_dataset/data/GCA_900618805.1/cds_from_genomic.fna") # Calculate gene lengths gene_lengths <- width(fasta_data) # Extract gene names/IDs from sequence IDs gene_names <- sapply(names(fasta_data), function(x) strsplit(x, " ")[[1]][1]) # Create a data frame with gene IDs and lengths gene_lengths_df <- data.frame(geneID = gene_names, length = gene_lengths) ``` #Same as annotation process #Need GO mappings #only run to install packages ```{r} # have to install the package that allows me to call nullp install.packages("remotes") remotes::install_github("nadiadavidson/goseq") library("goseq") ``` ```{r} pwf <- nullp(gene_data, bias.data = gene_lengths) GO_analysis <- goseq(pwf, gene2cat = "your_organism_GO_mapping") ```