Use ShortStack (Axtell 2013; Shahid and Axtell 2014; Johnson et al. 2016)to perform alignment of sRNAseq data and annotation of sRNA-producing genes.
The A.millepora genome will be used as the reference genome for A.pulchra, as A.pulchra does not currently have a sequenced genome and A.millepora had highest alignment rates for standard RNAseq data compared to other published genomes tested.
Inputs:
Requires trimmed sRNAseq files generated by 08.2-Dapul-sRNAseq-trimming-31bp-fastp-merged.Rmd
*flexbar_trim.25bp*.gzA.millepora genome FastA. See 12-Apul-sRNAseq-MirMachine.Rmd for download info if needed.
Outputs:
Software requirements:
Replace with name of your ShortStack environment and the path to the corresponding conda installation (find this after you’ve activated the environment).
E.g.
# Activate environment
conda activate ShortStack4_env
# Find conda path
which condashortstack_conda_env_name <- c("ShortStack-4.0.3_env")
shortstack_cond_path <- c("/home/sam/programs/mambaforge/condabin/conda")This allows usage of Bash variables across R Markdown chunks.
{
echo "#### Assign Variables ####"
echo ""
echo "# Trimmed FastQ naming pattern"
echo "export trimmed_fastqs_pattern='*fastp-adapters-polyG-31bp-merged.fq.gz'"
echo "# Data directories"
echo 'export deep_dive_dir=/home/shared/8TB_HDD_01/sam/gitrepos/deep-dive'
echo 'export deep_dive_data_dir="${deep_dive_dir}/data"'
echo 'export output_dir_top=${deep_dive_dir}/D-Apul/output/13.2-Apul-sRNAseq-ShortStack-31bp-fastp-merged'
echo 'export trimmed_fastqs_dir="${deep_dive_dir}/D-Apul/output/08.2-Dapul-sRNAseq-trimming-31bp-fastp-merged/trimmed-reads"'
echo ""
echo "# Input/Output files"
echo 'export genome_fasta_dir=${deep_dive_dir}/D-Apul/data/Amil/ncbi_dataset/data/GCF_013753865.1'
echo 'export genome_fasta_name="GCF_013753865.1_Amil_v2.1_genomic.fna"'
echo 'export shortstack_genome_fasta_name="GCF_013753865.1_Amil_v2.1_genomic.fa"'
echo 'export mirbase_mature_fasta=mature.fa'
echo 'export mirbase_mature_fasta_version=mirbase-mature-v22.1.fa'
echo 'export genome_fasta="${genome_fasta_dir}/${shortstack_genome_fasta_name}"'
echo ""
echo "# External data URLs"
echo 'export mirbase_fasta_url="https://mirbase.org/download_version_files/22.1/"'
echo ""
echo "# Set number of CPUs to use"
echo 'export threads=40'
echo ""
echo "# Initialize arrays"
echo 'export trimmed_fastqs_array=()'
} > .bashvars
cat .bashvars#### Assign Variables ####
# Trimmed FastQ naming pattern
export trimmed_fastqs_pattern='*fastp-adapters-polyG-31bp-merged.fq.gz'
# Data directories
export deep_dive_dir=/home/shared/8TB_HDD_01/sam/gitrepos/deep-dive
export deep_dive_data_dir="${deep_dive_dir}/data"
export output_dir_top=${deep_dive_dir}/D-Apul/output/13.2-Apul-sRNAseq-ShortStack-31bp-fastp-merged
export trimmed_fastqs_dir="${deep_dive_dir}/D-Apul/output/08.2-Dapul-sRNAseq-trimming-31bp-fastp-merged/trimmed-reads"
# Input/Output files
export genome_fasta_dir=${deep_dive_dir}/D-Apul/data/Amil/ncbi_dataset/data/GCF_013753865.1
export genome_fasta_name="GCF_013753865.1_Amil_v2.1_genomic.fna"
export shortstack_genome_fasta_name="GCF_013753865.1_Amil_v2.1_genomic.fa"
export mirbase_mature_fasta=mature.fa
export mirbase_mature_fasta_version=mirbase-mature-v22.1.fa
export genome_fasta="${genome_fasta_dir}/${shortstack_genome_fasta_name}"
# External data URLs
export mirbase_fasta_url="https://mirbase.org/download_version_files/22.1/"
# Set number of CPUs to use
export threads=40
# Initialize arrays
export trimmed_fastqs_array=()If this is successful, the first line of output should show that the Python being used is the one in your [ShortStack](https://github.com/MikeAxtell/ShortStack conda environment path.
E.g.
python:         /home/sam/programs/mambaforge/envs/mirmachine_env/bin/python
use_condaenv(condaenv = shortstack_conda_env_name, conda = shortstack_cond_path)
# Check successful env loading
py_config()python:         /home/sam/programs/mambaforge/envs/ShortStack-4.0.3_env/bin/python
libpython:      /home/sam/programs/mambaforge/envs/ShortStack-4.0.3_env/lib/libpython3.10.so
pythonhome:     /home/sam/programs/mambaforge/envs/ShortStack-4.0.3_env:/home/sam/programs/mambaforge/envs/ShortStack-4.0.3_env
version:        3.10.13 | packaged by conda-forge | (main, Dec 23 2023, 15:36:39) [GCC 12.3.0]
numpy:          /home/sam/programs/mambaforge/envs/ShortStack-4.0.3_env/lib/python3.10/site-packages/numpy
numpy_version:  1.26.4
NOTE: Python version was forced by use_python() function# Load bash variables into memory
source .bashvars
wget \
--directory-prefix ${deep_dive_data_dir} \
--recursive \
--no-check-certificate \
--continue \
--no-host-directories \
--no-directories \
--no-parent \
--quiet \
--execute robots=off \
 ${mirbase_fasta_url}/${mirbase_mature_fasta}
# Rename to indicate miRBase FastA version
mv ${deep_dive_data_dir}/${mirbase_mature_fasta} ${deep_dive_data_dir}/${mirbase_mature_fasta_version}
ls -lh "${deep_dive_data_dir}"total 13M
drwxr-xr-x 2 sam sam 4.0K Nov  7 14:36 blast_dbs
-rw-r--r-- 1 sam sam 3.7M Feb 15 08:42 mirbase-mature-v22.1.fa
-rw-r--r-- 1 sam sam 3.7M Dec  4 11:05 mirbase-mature-v22.1-no_spaces.fa
-rw-r--r-- 1 sam sam 3.7M Nov 17 07:59 mirbase-mature-v22.1-no_U.fa
-rw-r--r-- 1 sam sam 726K Nov  7 14:36 mirgene-mature-all-v2.1.fa
-rw-r--r-- 1 sam sam 726K Nov 17 07:59 mirgene-mature-all-v2.1-no_U.fa# Load bash variables into memory
source .bashvars
# Check for FastA file first
# Then create rename file if doesn't exist
if [ -f "${genome_fasta_dir}/${shortstack_genome_fasta_name}" ]; then
  echo "${genome_fasta_dir}/${shortstack_genome_fasta_name} already exists. Nothing to do."
  echo ""
else
  # Copy genome FastA to ShortStack-compatible filename (ending with .fa)
  cp ${genome_fasta_dir}/${genome_fasta_name} ${genome_fasta_dir}/${shortstack_genome_fasta_name}
fi
# Confirm
ls -lh ${genome_fasta_dir}/${shortstack_genome_fasta_name}/home/shared/8TB_HDD_01/sam/gitrepos/deep-dive/D-Apul/data/Amil/ncbi_dataset/data/GCF_013753865.1/GCF_013753865.1_Amil_v2.1_genomic.fa already exists. Nothing to do.
-rw-r--r-- 1 sam sam 460M Nov  6 12:40 /home/shared/8TB_HDD_01/sam/gitrepos/deep-dive/D-Apul/data/Amil/ncbi_dataset/data/GCF_013753865.1/GCF_013753865.1_Amil_v2.1_genomic.faUses the --dn_mirna option to identify miRNAs in the genome, without relying on the --known_miRNAs.
This part of the code redirects the output of time to the end of shortstack.log file.
; } \ 2>> ${output_dir_top}/shortstack.log# Load bash variables into memory
source .bashvars
# Make output directory, if it doesn't exist
mkdir --parents "${output_dir_top}"
# Create array of trimmed FastQs
trimmed_fastqs_array=(${trimmed_fastqs_dir}/${trimmed_fastqs_pattern})
# Pass array contents to new variable as space-delimited list
trimmed_fastqs_list=$(echo "${trimmed_fastqs_array[*]}")
###### Run ShortStack ######
{ time \
ShortStack \
--genomefile "${genome_fasta}" \
--readfile ${trimmed_fastqs_list} \
--known_miRNAs ${deep_dive_data_dir}/${mirbase_mature_fasta_version} \
--dn_mirna \
--threads ${threads} \
--outdir ${output_dir_top}/ShortStack_out \
&> ${output_dir_top}/shortstack.log ; } \
2>> ${output_dir_top}/shortstack.log
# Load bash variables into memory
source .bashvars
tail -n 3 ${output_dir_top}/shortstack.log \
| grep "real" \
| awk '{print "ShortStack runtime:" "\t" $2}'
ShortStack runtime: 50m0.594s# Load bash variables into memory
source .bashvars
tail -n 25 ${output_dir_top}/shortstack.logWriting final files
Found a total of 26 MIRNA loci
Non-MIRNA loci by DicerCall:
N 18777
22 43
23 38
21 11
24 7
Creating visualizations of microRNA loci with strucVis
<<< WARNING >>>
Do not rely on these results alone to annotate new MIRNA loci!
The false positive rate for de novo MIRNA identification is low, but NOT ZERO
Insepct each mirna locus, especially the strucVis output, and see
https://doi.org/10.1105/tpc.17.00851 , https://doi.org/10.1093/nar/gky1141
Thu 15 Feb 2024 08:27:13 -0800 PST
Run Completed!
real    50m0.594s
user    882m44.333s
sys 280m18.916sShortStack identified 26 miRNAs.
Results.txt# Load bash variables into memory
source .bashvars
head ${output_dir_top}/ShortStack_out/Results.txt
echo ""
echo "----------------------------------------------------------"
echo ""
echo "Nummber of potential loci:"
awk '(NR>1)' ${output_dir_top}/ShortStack_out/Results.txt | wc -lLocus   Name    Chrom   Start   End Length  Reads   DistinctSequences   FracTop Strand  MajorRNA    MajorRNAReads   Short   Long    21  22  23  24  DicerCall   MIRNA   known_miRNAs
NC_058066.1:152483-152906   Cluster_1   NC_058066.1 152483  152906  424 138 35  0.043478260869565216    -   UAAGUACUUUAUCAACUAACUCUAGGCA    70  1   127 0   1   0   9   N   N   NA
NC_058066.1:161064-161674   Cluster_2   NC_058066.1 161064  161674  611 511 231 0.2896281800391389  .   UUUUAGCCUAGUGCGGGUUUCCAGACGU    43  25  453 10  1   6   16  N   N   NA
NC_058066.1:172073-172496   Cluster_3   NC_058066.1 172073  172496  424 102 42  0.0784313725490196  -   GCGAUUAUUAACGGCUGGAACGAC    13  2   84  0   1   0   15  N   N   NA
NC_058066.1:203242-203651   Cluster_4   NC_058066.1 203242  203651  410 103 49  0.5922330097087378  .   UUCUGACUCUAUUAGCAACGAAGACUUU    25  0   103 0   0   0   0   N   N   NA
NC_058066.1:204533-205150   Cluster_5   NC_058066.1 204533  205150  618 337 174 0.7833827893175074  .   UCCCAACACGUCUAGACUGUACAAUUUCU   32  3   322 1   2   6   3   N   N   NA
NC_058066.1:205742-206966   Cluster_6   NC_058066.1 205742  206966  1225    2001    407 0.3313343328335832  .   CAAAAGAGCGGACAAAAUAGUCGACAGAUU  805 6   1958    7   4   6   20  N   N   NA
NC_058066.1:210841-211344   Cluster_7   NC_058066.1 210841  211344  504 1245    343 0.7534136546184739  .   UAAUACUUGUAGUGAAGGUUCAAUCUCGA   83  7   1136    7   9   20  66  N   N   NA
NC_058066.1:349655-351298   Cluster_8   NC_058066.1 349655  351298  1644    3272    1175    0.8092909535452323  +   UCAGCUUGGAAAUGACAGCUUUUGACGU    257 24  3139    12  24  19  54  N   N   NA
NC_058066.1:351491-353439   Cluster_9   NC_058066.1 351491  353439  1949    8880    1612    0.4152027027027027  .   UUUCAAAUCAAAGAUCUUCGCAACGAUGA   805 69  8504    34  34  123 116 N   N   NA
----------------------------------------------------------
Nummber of potential loci:
18902Column 20 of the Results.txt file identifies if a cluster is a miRNA or not (Y or N).
# Load bash variables into memory
source .bashvars
echo "Number of loci characterized as miRNA:"
awk '$20=="Y" {print $0}' ${output_dir_top}/ShortStack_out/Results.txt \
| wc -l
echo ""
echo "----------------------------------------------------------"
echo ""
echo "Number of loci _not_ characterized as miRNA:"
awk '$20=="N" {print $0}' ${output_dir_top}/ShortStack_out/Results.txt \
| wc -l
Number of loci characterized as miRNA:
26
----------------------------------------------------------
Number of loci _not_ characterized as miRNA:
18876Column 21 of the Results.txt file identifies if a cluster aligned to a known miRNA (miRBase) or not (Y or NA).
Since there are no miRNAs, the following code will not print any output.
The echo command after the awk command is simply there to prove that the chunk executed.
# Load bash variables into memory
source .bashvars
echo "Number of loci matching miRBase miRNAs:"
awk '$21!="NA" {print $0}' ${output_dir_top}/ShortStack_out/Results.txt \
| wc -l
echo ""
echo "----------------------------------------------------------"
echo ""
echo "Number of loci _not_ matching miRBase miRNAs:"
awk '$21=="NA" {print $0}' ${output_dir_top}/ShortStack_out/Results.txt \
| wc -l
Number of loci matching miRBase miRNAs:
34
----------------------------------------------------------
Number of loci _not_ matching miRBase miRNAs:
18869Although there are loci with matches to miRBase miRNAs, ShortStack did not annotate these clusters as miRNAs likely because they do not also match secondary structure criteria.
Many of these are large (by GitHub standards) BAM files, so will not be added to the repo.
Additionally, it’s unlikely we’ll utilize most of the other files (bigwig) generated by ShortStack.
# Load bash variables into memory
source .bashvars
tree -h ${output_dir_top}/
/home/shared/8TB_HDD_01/sam/gitrepos/deep-dive/D-Apul/output/13.2-Apul-sRNAseq-ShortStack-31bp-fastp-merged/
├── [ 21K]  shortstack.log
└── [ 36K]  ShortStack_out
    ├── [ 28K]  alignment_details.tsv
    ├── [1.1M]  Counts.txt
    ├── [ 87K]  known_miRNAs.gff3
    ├── [1.8M]  known_miRNAs_unaligned.fasta
    ├── [5.3M]  merged_alignments_21_m.bw
    ├── [5.8M]  merged_alignments_21_p.bw
    ├── [5.1M]  merged_alignments_22_m.bw
    ├── [5.5M]  merged_alignments_22_p.bw
    ├── [ 10M]  merged_alignments_23-24_m.bw
    ├── [ 11M]  merged_alignments_23-24_p.bw
    ├── [1.4G]  merged_alignments.bam
    ├── [226K]  merged_alignments.bam.csi
    ├── [ 64M]  merged_alignments_other_m.bw
    ├── [ 67M]  merged_alignments_other_p.bw
    ├── [ 46M]  merged_alignments_sRNA-ACR-140-S1-TP2-fastp-adapters-polyG-31bp-merged.bw
    ├── [ 50M]  merged_alignments_sRNA-ACR-145-S1-TP2-fastp-adapters-polyG-31bp-merged.bw
    ├── [ 48M]  merged_alignments_sRNA-ACR-150-S1-TP2-fastp-adapters-polyG-31bp-merged.bw
    ├── [ 41M]  merged_alignments_sRNA-ACR-173-S1-TP2-fastp-adapters-polyG-31bp-merged.bw
    ├── [ 41M]  merged_alignments_sRNA-ACR-178-S1-TP2-fastp-adapters-polyG-31bp-merged.bw
    ├── [7.5K]  mir.fasta
    ├── [1.9M]  Results.gff3
    ├── [2.8M]  Results.txt
    ├── [256M]  sRNA-ACR-140-S1-TP2-fastp-adapters-polyG-31bp-merged.bam
    ├── [223K]  sRNA-ACR-140-S1-TP2-fastp-adapters-polyG-31bp-merged.bam.csi
    ├── [291M]  sRNA-ACR-145-S1-TP2-fastp-adapters-polyG-31bp-merged.bam
    ├── [227K]  sRNA-ACR-145-S1-TP2-fastp-adapters-polyG-31bp-merged.bam.csi
    ├── [304M]  sRNA-ACR-150-S1-TP2-fastp-adapters-polyG-31bp-merged.bam
    ├── [229K]  sRNA-ACR-150-S1-TP2-fastp-adapters-polyG-31bp-merged.bam.csi
    ├── [268M]  sRNA-ACR-173-S1-TP2-fastp-adapters-polyG-31bp-merged.bam
    ├── [227K]  sRNA-ACR-173-S1-TP2-fastp-adapters-polyG-31bp-merged.bam.csi
    ├── [240M]  sRNA-ACR-178-S1-TP2-fastp-adapters-polyG-31bp-merged.bam
    ├── [228K]  sRNA-ACR-178-S1-TP2-fastp-adapters-polyG-31bp-merged.bam.csi
    └── [4.0K]  strucVis
        ├── [ 12K]  Cluster_10064.ps
        ├── [ 30K]  Cluster_10064.txt
        ├── [ 11K]  Cluster_10195.ps
        ├── [5.5K]  Cluster_10195.txt
        ├── [ 12K]  Cluster_10494.ps
        ├── [ 11K]  Cluster_10494.txt
        ├── [ 12K]  Cluster_1898.ps
        ├── [8.7K]  Cluster_1898.txt
        ├── [ 12K]  Cluster_1998.ps
        ├── [ 32K]  Cluster_1998.txt
        ├── [ 12K]  Cluster_2072.ps
        ├── [ 41K]  Cluster_2072.txt
        ├── [ 12K]  Cluster_2520.ps
        ├── [ 39K]  Cluster_2520.txt
        ├── [ 11K]  Cluster_2521.ps
        ├── [ 28K]  Cluster_2521.txt
        ├── [ 11K]  Cluster_3015.ps
        ├── [5.0K]  Cluster_3015.txt
        ├── [ 12K]  Cluster_3083.ps
        ├── [6.4K]  Cluster_3083.txt
        ├── [ 11K]  Cluster_321.ps
        ├── [ 18K]  Cluster_321.txt
        ├── [ 12K]  Cluster_3674.ps
        ├── [ 15K]  Cluster_3674.txt
        ├── [ 12K]  Cluster_4250.ps
        ├── [ 29K]  Cluster_4250.txt
        ├── [ 11K]  Cluster_4864.ps
        ├── [ 15K]  Cluster_4864.txt
        ├── [ 11K]  Cluster_518.ps
        ├── [3.5K]  Cluster_518.txt
        ├── [ 12K]  Cluster_5243.ps
        ├── [ 57K]  Cluster_5243.txt
        ├── [ 12K]  Cluster_552.ps
        ├── [ 51K]  Cluster_552.txt
        ├── [ 12K]  Cluster_6374.ps
        ├── [ 21K]  Cluster_6374.txt
        ├── [ 11K]  Cluster_6437.ps
        ├── [ 42K]  Cluster_6437.txt
        ├── [ 12K]  Cluster_6959.ps
        ├── [ 42K]  Cluster_6959.txt
        ├── [ 12K]  Cluster_6965.ps
        ├── [ 11K]  Cluster_6965.txt
        ├── [ 12K]  Cluster_6977.ps
        ├── [ 63K]  Cluster_6977.txt
        ├── [ 11K]  Cluster_7075.ps
        ├── [ 24K]  Cluster_7075.txt
        ├── [ 11K]  Cluster_8478.ps
        ├── [4.6K]  Cluster_8478.txt
        ├── [ 12K]  Cluster_8546.ps
        ├── [ 28K]  Cluster_8546.txt
        ├── [ 12K]  Cluster_9465.ps
        └── [ 28K]  Cluster_9465.txt
2 directories, 85 files