--- title: "Kallisto Pseudo-alignment" author: Steven Roberts date: "`r format(Sys.time(), '%d %B, %Y')`" output: github_document: toc: true toc_depth: 3 number_sections: true html_preview: true html_document: theme: readable highlight: zenburn toc: true toc_float: true number_sections: true code_folding: show code_download: true --- ```{r setup, include=FALSE} library(knitr) library(tidyverse) library(kableExtra) library(DESeq2) library(pheatmap) library(RColorBrewer) library(data.table) library(DT) library(formattable) library(Biostrings) library(spaa) knitr::opts_chunk$set( echo = TRUE, # Display code chunks eval = FALSE, # Evaluate code chunks warning = FALSE, # Hide warnings message = FALSE, # Hide messages fig.width = 6, # Set plot width in inches fig.height = 4, # Set plot height in inches fig.align = "center" # Align plots to the center ) ``` # Download tag-seq data ```{bash} wget -r \ --no-directories --no-parent \ -P ../data \ -A .fastq.gz https://gannet.fish.washington.edu/panopea/PSMFC-mytilus-byssus-pilot/20220405-tagseq/ \ --no-check-certificate ``` # get transcriptome ```{r, engine='bash'} cd ../data curl -O https://owl.fish.washington.edu/halfshell/genomic-databank/Mtros-hq_transcripts.fasta ``` # Create indices for transcriptome with Kallisto ```{bash} # Build index /home/shared/kallisto_linux-v0.50.1/kallisto \ index \ -t 20 \ -i ../data/Mtros-hq_transcripts.index \ ../data/Mtros-hq_transcripts.fasta ``` # Kallisto quantification ```{bash} # Set the paths DATA_DIRECTORY="../data" KALLISTO_INDEX="../data/Mtros-hq_transcripts.index" OUTPUT_DIRECTORY="../analyses/07-kallisto" # Iterate over all .fq.gz files in the data directory for FILE in "$DATA_DIRECTORY"/*.fastq.gz; do # Extract the base name of the file for naming the output folder BASENAME=$(basename "$FILE" _R1_001.fastq.gz) # Create output directory for this sample SAMPLE_OUTPUT="$OUTPUT_DIRECTORY/$BASENAME" mkdir -p "$SAMPLE_OUTPUT" # Run Kallisto quantification /home/shared/kallisto_linux-v0.50.1/kallisto quant -i "$KALLISTO_INDEX" -o "$SAMPLE_OUTPUT" \ --single -t 20 -l 65 -s 2 "$FILE" done echo "Kallisto quantification complete." ``` # creating count matrix ```{bash} perl /home/shared/trinityrnaseq-v2.12.0/util/abundance_estimates_to_matrix.pl \ --est_method kallisto \ --gene_trans_map none \ --out_prefix ../analyses/07-kallisto/test \ --name_sample_by_basedir \ ../analyses/07-kallisto/*/abundance.tsv ``` ```{r, engine='bash', eval=TRUE} head ../analyses/07-kallisto/test.isoform.counts.matrix ``` ```{r, eval=TRUE} countmatrix <- read.delim("../analyses/07-kallisto/test.isoform.counts.matrix", header = TRUE, sep = '\t') rownames(countmatrix) <- countmatrix$X countmatrix <- countmatrix[,-1] head(countmatrix) ``` ```{r, eval=TRUE} countmatrix <- round(countmatrix, 0) str(countmatrix) ``` ```{r, eval=TRUE} deseq2.colData <- data.frame(condition=factor(c(rep("control", 34), rep("desicated", 34))), type=factor(rep("single-read", 68))) rownames(deseq2.colData) <- colnames(data) deseq2.dds <- DESeqDataSetFromMatrix(countData = countmatrix, colData = deseq2.colData, design = ~ condition) ``` ```{r, eval=TRUE, cache=TRUE} deseq2.dds <- DESeq(deseq2.dds) deseq2.res <- results(deseq2.dds) deseq2.res <- deseq2.res[order(rownames(deseq2.res)), ] ``` ### Note sample treatments are not correct. ```{r, eval=FALSE} vsd <- vst(deseq2.dds, blind = FALSE) plotPCA(vsd, intgroup = "condition") ```