Use ShortStack (Axtell 2013; Johnson et al. 2016; Shahid and Axtell 2014) to perform alignment of sRNAseq data and annotation of sRNA-producing genes.
Inputs:
Requires trimmed sRNAseq files generated by 06-Peve-sRNAseq-trimming.Rmd
*flexbar_trim.25bp*.gz
A.millepora genome FastA. See 07-Peve-sRNAseq-MirMachine.md for download info if needed.
Outputs:
Software requirements:
Replace with name of your ShortStack environment and the path to the corresponding conda installation (find this after you’ve activated the environment).
E.g.
# Activate environment
conda activate ShortStack4_env
# Find conda path
which conda
shortstack_conda_env_name <- c("ShortStack4_env")
shortstack_cond_path <- c("/home/sam/programs/mambaforge/condabin/conda")
This allows usage of Bash variables across R Markdown chunks.
{
echo "#### Assign Variables ####"
echo ""
echo "# Trimmed FastQ naming pattern"
echo "export trimmed_fastqs_pattern='*flexbar_trim.25bp*.fastq.gz'"
echo "# Data directories"
echo 'export deep_dive_dir=/home/shared/8TB_HDD_01/sam/gitrepos/deep-dive'
echo 'export deep_dive_data_dir="${deep_dive_dir}/data"'
echo 'export output_dir_top=${deep_dive_dir}/E-Peve/output/08-Peve-sRNAseq-ShortStack'
echo 'export trimmed_fastqs_dir="${deep_dive_dir}/E-Peve/output/06-Peve-sRNAseq-trimming/trimmed-reads"'
echo ""
echo "# Input/Output files"
echo 'export genome_fasta_dir=${deep_dive_dir}/E-Peve/data'
echo 'export genome_fasta_name="Porites_evermanni_v1.fa"'
echo 'export shortstack_genome_fasta_name="Porites_evermanni_v1.fa"'
echo 'export mirbase_mature_fasta=mature.fa'
echo 'export mirbase_mature_fasta_version=mirbase-mature-v22.1.fa'
echo 'export genome_fasta="${genome_fasta_dir}/${shortstack_genome_fasta_name}"'
echo ""
echo "# External data URLs"
echo 'export mirbase_fasta_url="https://mirbase.org/download_version_files/22.1/"'
echo ""
echo "# Set number of CPUs to use"
echo 'export threads=46'
echo ""
echo "# Initialize arrays"
echo 'export trimmed_fastqs_array=()'
} > .bashvars
cat .bashvars
#### Assign Variables ####
# Trimmed FastQ naming pattern
export trimmed_fastqs_pattern='*flexbar_trim.25bp*.fastq.gz'
# Data directories
export deep_dive_dir=/home/shared/8TB_HDD_01/sam/gitrepos/deep-dive
export deep_dive_data_dir="${deep_dive_dir}/data"
export output_dir_top=${deep_dive_dir}/E-Peve/output/08-Peve-sRNAseq-ShortStack
export trimmed_fastqs_dir="${deep_dive_dir}/E-Peve/output/06-Peve-sRNAseq-trimming/trimmed-reads"
# Input/Output files
export genome_fasta_dir=${deep_dive_dir}/E-Peve/data
export genome_fasta_name="Porites_evermanni_v1.fa"
export shortstack_genome_fasta_name="Porites_evermanni_v1.fa"
export mirbase_mature_fasta=mature.fa
export mirbase_mature_fasta_version=mirbase-mature-v22.1.fa
export genome_fasta="${genome_fasta_dir}/${shortstack_genome_fasta_name}"
# External data URLs
export mirbase_fasta_url="https://mirbase.org/download_version_files/22.1/"
# Set number of CPUs to use
export threads=46
# Initialize arrays
export trimmed_fastqs_array=()
If this is successful, the first line of output should show that the Python being used is the one in your [ShortStack](https://github.com/MikeAxtell/ShortStack conda environment path.
E.g.
python: /home/sam/programs/mambaforge/envs/mirmachine_env/bin/python
use_condaenv(condaenv = shortstack_conda_env_name, conda = shortstack_cond_path)
# Check successful env loading
py_config()
python: /home/sam/programs/mambaforge/envs/ShortStack4_env/bin/python
libpython: /home/sam/programs/mambaforge/envs/ShortStack4_env/lib/libpython3.10.so
pythonhome: /home/sam/programs/mambaforge/envs/ShortStack4_env:/home/sam/programs/mambaforge/envs/ShortStack4_env
version: 3.10.13 | packaged by conda-forge | (main, Oct 26 2023, 18:07:37) [GCC 12.3.0]
numpy: /home/sam/programs/mambaforge/envs/ShortStack4_env/lib/python3.10/site-packages/numpy
numpy_version: 1.26.0
NOTE: Python version was forced by use_python() function
# Load bash variables into memory
source .bashvars
wget \
--directory-prefix ${deep_dive_data_dir} \
--recursive \
--no-check-certificate \
--continue \
--no-host-directories \
--no-directories \
--no-parent \
--quiet \
--execute robots=off \
${mirbase_fasta_url}/${mirbase_mature_fasta}
# Rename to indicate miRBase FastA version
mv ${deep_dive_data_dir}/${mirbase_mature_fasta} ${deep_dive_data_dir}/${mirbase_mature_fasta_version}
ls -lh "${deep_dive_data_dir}"
total 13M
drwxr-xr-x 2 sam sam 4.0K Nov 7 14:36 blast_dbs
-rw-r--r-- 1 sam sam 3.7M Nov 15 21:30 mirbase-mature-v22.1.fa
-rw-r--r-- 1 sam sam 3.7M Nov 14 09:39 mirbase-mature-v22.1-no_spaces.fa
-rw-r--r-- 1 sam sam 3.7M Nov 7 14:31 mirbase-mature-v22.1-no_U.fa
-rw-r--r-- 1 sam sam 726K Nov 7 14:36 mirgene-mature-all-v2.1.fa
-rw-r--r-- 1 sam sam 726K Nov 7 14:31 mirgene-mature-all-v2.1-no_U.fa
Uses the --dn_mirna
option to identify miRNAs in the genome, without relying on the --known_miRNAs
.
This part of the code redirects the output of time
to the end of shortstack.log
file.
; } \ 2>> ${output_dir_top}/shortstack.log
# Load bash variables into memory
source .bashvars
# Create array of trimmed FastQs
trimmed_fastqs_array=(${trimmed_fastqs_dir}/${trimmed_fastqs_pattern})
# Pass array contents to new variable as space-delimited list
trimmed_fastqs_list=$(echo "${trimmed_fastqs_array[*]}")
###### Run ShortStack ######
{ time \
ShortStack \
--genomefile "${genome_fasta}" \
--readfile ${trimmed_fastqs_list} \
--known_miRNAs ${deep_dive_data_dir}/${mirbase_mature_fasta_version} \
--dn_mirna \
--threads ${threads} \
--outdir ${output_dir_top}/ShortStack_out \
&> ${output_dir_top}/shortstack.log ; } \
2>> ${output_dir_top}/shortstack.log
# Load bash variables into memory
source .bashvars
tail -n 3 ${output_dir_top}/shortstack.log \
| grep "real" \
| awk '{print "ShortStack runtime:" "\t" $2}'
ShortStack runtime: 75m11.616s
# Load bash variables into memory
source .bashvars
tail -n 20 ${output_dir_top}/shortstack.log
Screening of possible de novo microRNAs
No microRNA loci were found!
Writing final files
Non-MIRNA loci by DicerCall:
N 14896
22 56
23 43
24 24
21 21
Wed 15 Nov 2023 20:35:51 -0800 PST
Run Completed!
real 75m11.616s
user 1670m15.527s
sys 461m28.418s
ShortStack didn’t identify any miRNAs.
Results.txt
# Load bash variables into memory
source .bashvars
head ${output_dir_top}/ShortStack_out/Results.txt
echo ""
echo "----------------------------------------------------------"
echo ""
echo "Nummber of potential loci:"
awk '(NR>1)' ${output_dir_top}/ShortStack_out/Results.txt | wc -l
Locus Name Chrom Start End Length Reads UniqueReads FracTop Strand MajorRNA MajorRNAReads Short Long 21 22 23 24 DicerCall MIRNA known_miRNAs
Porites_evermani_scaffold_1:45716-46125 Cluster_1 Porites_evermani_scaffold_1 45716 46125 410 215 93 0.49767441860465117 . ACUGAUUCUUGGCCACCUCUACUGU 9 45 96 18 16 18 22 N N NA
Porites_evermani_scaffold_1:406307-406731 Cluster_2 Porites_evermani_scaffold_1 406307 406731 425 340 103 0.5029411764705882 . UGAGUGUAUUCUUGAACUGUUUUCC 73 2 314 2 1 9 12 N N NA
Porites_evermani_scaffold_1:409840-410269 Cluster_3 Porites_evermani_scaffold_1 409840 410269 430 365 55 0.4876712328767123 . UGGAACUCCGAUUUAGAACUUGCAA 124 0 355 1 3 0 6 N N NA
Porites_evermani_scaffold_1:465246-465668 Cluster_4 Porites_evermani_scaffold_1 465246 465668 423 371 97 0.49326145552560646 . AAGUUGCUCUGAAGAUUAUGU 39 66 140 96 15 40 14 N N NA
Porites_evermani_scaffold_1:468473-468950 Cluster_5 Porites_evermani_scaffold_1 468473 468950 478 231785 1297 0.49824190521388356 . AAUUCAGAAAAACUGAACAGUCAUC 89861 3892 224933 229 276 305 2150 N N NA
Porites_evermani_scaffold_1:476827-477250 Cluster_6 Porites_evermani_scaffold_1 476827 477250 424 241 67 0.4896265560165975 . CGUGUCUUCGUAAUCGUCUCGUACA 14 66 85 0 24 30 36 N N NA
Porites_evermani_scaffold_1:486444-486864 Cluster_7 Porites_evermani_scaffold_1 486444 486864 421 116 17 0.49137931034482757 . AUAUUGACGAAUCCUGGCCUAGUGA 49 0 108 0 0 8 0 N N NA
Porites_evermani_scaffold_1:534428-534866 Cluster_8 Porites_evermani_scaffold_1 534428 534866 439 1252 200 0.4976038338658147 . AGCCAAAGCUAUAAGUUUACGUCUC 298 62 892 120 28 70 80 N N NA
Porites_evermani_scaffold_1:729098-729587 Cluster_9 Porites_evermani_scaffold_1 729098 729587 490 13328 605 0.49797418967587037 . UGUGGUCAGACAGUUCUGGCUUGUC 4355 409 11693 262 478 244 242 N N NA
----------------------------------------------------------
Nummber of potential loci:
15040
Column 20 of the Results.txt
file identifies if a cluster is a miRNA or not (Y
or N
).
# Load bash variables into memory
source .bashvars
echo "Number of loci characterized as miRNA:"
awk '$20=="Y" {print $0}' ${output_dir_top}/ShortStack_out/Results.txt \
| wc -l
echo ""
echo "----------------------------------------------------------"
echo ""
echo "Number of loci _not_ characterized as miRNA:"
awk '$20=="N" {print $0}' ${output_dir_top}/ShortStack_out/Results.txt \
| wc -l
Number of loci characterized as miRNA:
0
----------------------------------------------------------
Number of loci _not_ characterized as miRNA:
15040
Column 21 of the Results.txt
file identifies if a cluster aligned to a known miRNA (miRBase) or not (Y
or NA
).
Since there are no miRNAs, the following code will not print any output.
The echo
command after the awk
command is simply there to prove that the chunk executed.
# Load bash variables into memory
source .bashvars
echo "Number of loci matching miRBase miRNAs:"
awk '$21!="NA" {print $0}' ${output_dir_top}/ShortStack_out/Results.txt \
| wc -l
echo ""
echo "----------------------------------------------------------"
echo ""
echo "Number of loci _not_ matching miRBase miRNAs:"
awk '$21=="NA" {print $0}' ${output_dir_top}/ShortStack_out/Results.txt \
| wc -l
Number of loci matching miRBase miRNAs:
25
----------------------------------------------------------
Number of loci _not_ matching miRBase miRNAs:
15016
Although there are loci with matches to miRBase miRNAs, ShortStack did not annotate these clusters as miRNAs likely because they do not also match secondary structure criteria.
Many of these are large (by GitHub standards) BAM files, so will not be added to the repo.
Additionally, it’s unlikely we’ll utilize most of the other files (bigwig) generated by ShortStack.
# Load bash variables into memory
source .bashvars
tree -h ${output_dir_top}/
/home/shared/8TB_HDD_01/sam/gitrepos/deep-dive/E-Peve/output/08-Peve-sRNAseq-ShortStack/
├── [ 21K] shortstack.log
└── [228K] ShortStack_out
├── [ 28K] alignment_details.tsv
├── [1.1M] Counts.txt
├── [204K] known_miRNAs.gff3
├── [1.8M] known_miRNAs_unaligned.fasta
├── [ 11M] merged_alignments_21_m.bw
├── [ 11M] merged_alignments_21_p.bw
├── [ 10M] merged_alignments_22_m.bw
├── [ 10M] merged_alignments_22_p.bw
├── [ 17M] merged_alignments_23-24_m.bw
├── [ 17M] merged_alignments_23-24_p.bw
├── [1.3G] merged_alignments.bam
├── [348K] merged_alignments.bam.csi
├── [ 77M] merged_alignments_other_m.bw
├── [ 79M] merged_alignments_other_p.bw
├── [ 32M] merged_alignments_sRNA-POR-73-S1-TP2.flexbar_trim.25bp_1.bw
├── [ 33M] merged_alignments_sRNA-POR-73-S1-TP2.flexbar_trim.25bp_2.bw
├── [ 45M] merged_alignments_sRNA-POR-79-S1-TP2.flexbar_trim.25bp_1.bw
├── [ 45M] merged_alignments_sRNA-POR-79-S1-TP2.flexbar_trim.25bp_2.bw
├── [ 51M] merged_alignments_sRNA-POR-82-S1-TP2.flexbar_trim.25bp_1.bw
├── [ 51M] merged_alignments_sRNA-POR-82-S1-TP2.flexbar_trim.25bp_2.bw
├── [1.7M] Results.gff3
├── [2.6M] Results.txt
├── [200M] sRNA-POR-73-S1-TP2.flexbar_trim.25bp_1.bam
├── [303K] sRNA-POR-73-S1-TP2.flexbar_trim.25bp_1.bam.csi
├── [206M] sRNA-POR-73-S1-TP2.flexbar_trim.25bp_2.bam
├── [304K] sRNA-POR-73-S1-TP2.flexbar_trim.25bp_2.bam.csi
├── [219M] sRNA-POR-79-S1-TP2.flexbar_trim.25bp_1.bam
├── [309K] sRNA-POR-79-S1-TP2.flexbar_trim.25bp_1.bam.csi
├── [228M] sRNA-POR-79-S1-TP2.flexbar_trim.25bp_2.bam
├── [310K] sRNA-POR-79-S1-TP2.flexbar_trim.25bp_2.bam.csi
├── [243M] sRNA-POR-82-S1-TP2.flexbar_trim.25bp_1.bam
├── [314K] sRNA-POR-82-S1-TP2.flexbar_trim.25bp_1.bam.csi
├── [251M] sRNA-POR-82-S1-TP2.flexbar_trim.25bp_2.bam
└── [315K] sRNA-POR-82-S1-TP2.flexbar_trim.25bp_2.bam.csi
1 directory, 35 files