Use ShortStack (Axtell2013-xu?; Johnson et al. 2016; Shahid2014-lx?) to perform alignment of sRNAseq data and annotation of sRNA-producing genes.
Due to large file sizes of some of the input and output files, not all files can be sync’d to GitHub. A full backup of this repo is available here:
Inputs:
Requires trimmed sRNAseq files generated by 01.00-trimming-fastp-fastqc.Rmd
*fastp-adapters-polyG-31bp-merged.fq.gzGenome FastA: GCF_026571515.1_ASM2657151v2_genomic.fna
MiRBase v22.1 FastA: mirbase-mature-v22.1.fa
Outputs:
Software requirements:
Replace with name of your ShortStack environment and the path to the corresponding conda installation (find this after you’ve activated the environment).
E.g.
# Activate environment
conda activate ShortStack-4.1.1_env
# Find conda path
which condashortstack_conda_env_name <- c("ShortStack-4.1.1_env")
shortstack_cond_path <- c("/home/sam/programs/mambaforge/condabin/conda")This allows usage of Bash variables across R Markdown chunks.
{
echo "#### Assign Variables ####"
echo ""
echo "# Trimmed FastQ naming pattern"
echo "export trimmed_fastqs_pattern='*fastp-adapters-polyG-31bp-merged.fq.gz'"
echo "# Data directories"
echo 'export repo_dir=/home/shared/8TB_HDD_01/sam/gitrepos/RobertsLab/project-clam-oa'
echo 'export repo_data_dir="${repo_dir}/data"'
echo 'export output_dir_top=${repo_dir}/output/02.00-ShortStack-31bp-fastp-merged'
echo 'export trimmed_fastqs_dir="${repo_dir}/output/01.00-trimming-fastp-fastqc"'
echo ""
echo "# Input/Output files"
echo 'export genome_fasta_dir=${repo_data_dir}/genome_files'
echo 'export genome_fasta_name="GCF_026571515.1_ASM2657151v2_genomic.fna"'
echo 'export shortstack_genome_fasta_name="GCF_026571515.1_ASM2657151v2_genomic.fa"'
echo 'export mirbase_mature_fasta=mirbase-mature-v22.1.fa'
echo 'export genome_fasta="${genome_fasta_dir}/${shortstack_genome_fasta_name}"'
echo ""
echo "# Set number of CPUs to use"
echo 'export threads=40'
echo ""
echo "# Initialize arrays"
echo 'export trimmed_fastqs_array=()'
} > .bashvars
cat .bashvars#### Assign Variables ####
# Trimmed FastQ naming pattern
export trimmed_fastqs_pattern='*fastp-adapters-polyG-31bp-merged.fq.gz'
# Data directories
export repo_dir=/home/shared/8TB_HDD_01/sam/gitrepos/RobertsLab/project-clam-oa
export repo_data_dir="${repo_dir}/data"
export output_dir_top=${repo_dir}/output/02.00-ShortStack-31bp-fastp-merged
export trimmed_fastqs_dir="${repo_dir}/output/01.00-trimming-fastp-fastqc"
# Input/Output files
export genome_fasta_dir=${repo_data_dir}/genome_files
export genome_fasta_name="GCF_026571515.1_ASM2657151v2_genomic.fna"
export shortstack_genome_fasta_name="GCF_026571515.1_ASM2657151v2_genomic.fa"
export mirbase_mature_fasta=mirbase-mature-v22.1.fa
export genome_fasta="${genome_fasta_dir}/${shortstack_genome_fasta_name}"
# Set number of CPUs to use
export threads=40
# Initialize arrays
export trimmed_fastqs_array=()If this is successful, the first line of output should show that the Python being used is the one in your [ShortStack](https://github.com/MikeAxtell/ShortStack conda environment path.
E.g.
python:         /home/sam/programs/mambaforge/envs/mirmachine_env/bin/python
use_condaenv(condaenv = shortstack_conda_env_name, conda = shortstack_cond_path)
# Check successful env loading
py_config()python:         /home/sam/programs/mambaforge/envs/ShortStack-4.1.1_env/bin/python
libpython:      /home/sam/programs/mambaforge/envs/ShortStack-4.1.1_env/lib/libpython3.12.so
pythonhome:     /home/sam/programs/mambaforge/envs/ShortStack-4.1.1_env:/home/sam/programs/mambaforge/envs/ShortStack-4.1.1_env
version:        3.12.8 | packaged by conda-forge | (main, Dec  5 2024, 14:24:40) [GCC 13.3.0]
numpy:          /home/sam/programs/mambaforge/envs/ShortStack-4.1.1_env/lib/python3.12/site-packages/numpy
numpy_version:  2.2.0
NOTE: Python version was forced by use_python() functionUses the --dn_mirna option to identify miRNAs in the genome, without relying on the --known_miRNAs.
This part of the code redirects the output of time to the end of shortstack.log file.
; } \ 2>> ${output_dir_top}/shortstack.log# Load bash variables into memory
source .bashvars
# Make output directory, if it doesn't exist
mkdir --parents "${output_dir_top}"
# Create array of trimmed FastQs
trimmed_fastqs_array=(${trimmed_fastqs_dir}/${trimmed_fastqs_pattern})
# Pass array contents to new variable as space-delimited list
trimmed_fastqs_list=$(echo "${trimmed_fastqs_array[*]}")
# Rename genome FastA to ShortStack naming convention
cp "${genome_fasta_dir}"/"${genome_fasta_name}" "${genome_fasta}"
###### Run ShortStack ######
{ time \
ShortStack \
--genomefile "${genome_fasta}" \
--readfile ${trimmed_fastqs_list} \
--known_miRNAs ${repo_data_dir}/${mirbase_mature_fasta} \
--dn_mirna \
--threads ${threads} \
--outdir ${output_dir_top}/ShortStack_out \
&> ${output_dir_top}/shortstack.log ; } \
2>> ${output_dir_top}/shortstack.log
# Load bash variables into memory
source .bashvars
tail -n 3 ${output_dir_top}/shortstack.log \
| grep "real" \
| awk '{print "ShortStack runtime:" "\t" $2}'
ShortStack runtime: 52m22.645s# Load bash variables into memory
source .bashvars
tail -n 25 ${output_dir_top}/shortstack.logWriting final files
Found a total of 37 MIRNA loci
Non-MIRNA loci by DicerCall:
N 33525
22 60
21 18
23 17
24 9
Creating visualizations of microRNA loci with strucVis
<<< WARNING >>>
Do not rely on these results alone to annotate new MIRNA loci!
The false positive rate for de novo MIRNA identification is low, but NOT ZERO
Insepct each mirna locus, especially the strucVis output, and see
https://doi.org/10.1105/tpc.17.00851 , https://doi.org/10.1093/nar/gky1141
Tue 10 Dec 2024 08:37:06 -0800 PST
Run Completed!
real    52m22.645s
user    672m59.895s
sys 196m42.848sShortStack found NN miRNAs.
Results.txt# Load bash variables into memory
source .bashvars
head ${output_dir_top}/ShortStack_out/Results.txt
echo ""
echo "----------------------------------------------------------"
echo ""
echo "Nummber of potential loci:"
awk '(NR>1)' ${output_dir_top}/ShortStack_out/Results.txt | wc -lLocus   Name    Chrom   Start   End Length  Reads   DistinctSequences   FracTop Strand  MajorRNA    MajorRNAReads   Short   Long    21  22  23  24  DicerCall   MIRNA   known_miRNAs
NW_026851514.1:11444-11873  Cluster_1   NW_026851514.1  11444   11873   430 934 210 0.913   +   UUGAAUUCUGCACACUACUUAUGAUAAAAGU 171 8   910 3   0   4   9   N   N   NA
NW_026851514.1:12401-12830  Cluster_2   NW_026851514.1  12401   12830   430 535 110 0.94    +   UGAUAACUCUUUUAACUGAUUCAUACGAAC  318 1   523 3   2   1   5   N   N   NA
NW_026851515.1:76365-77030  Cluster_3   NW_026851515.1  76365   77030   666 11653   547 0.003   -   UCCUACGAUCAAAGUUCGGCAACGUUCGAC  3215    5   11536   10  11  54  37  N   N   NA
NW_026851515.1:77089-77506  Cluster_4   NW_026851515.1  77089   77506   418 798 196 0.044   -   UACUAGUACCUCUUCGAUUGCAUUUU  100 5   732 3   16  22  20  N   N   NA
NW_026851515.1:77511-78358  Cluster_5   NW_026851515.1  77511   78358   848 5726    946 0.004   -   UAGAUAUGUCACUGUUUAUUUCAUUGUC    661 53  5066    261 114 98  134 N   N   NA
NW_026851515.1:78386-79090  Cluster_6   NW_026851515.1  78386   79090   705 1240    239 0.011   -   UGUAGUUCUUUGAAUAUAUCUCAGUCAUUG  264 6   1217    1   5   5   6   N   N   NA
NW_026851515.1:79280-79708  Cluster_7   NW_026851515.1  79280   79708   429 946 115 0.011   -   UUAUAUAUGUUCUUGCUGAUCUUAAUUGG   396 4   927 9   1   2   3   N   N   NA
NW_026851515.1:79774-80496  Cluster_8   NW_026851515.1  79774   80496   723 7823    1009    0.015   -   UUUGAUCGCUGUUUUUCAAUAUGACUGUGC  848 90  7238    248 71  48  128 N   N   NA
NW_026851515.1:88853-89546  Cluster_9   NW_026851515.1  88853   89546   694 1600    307 0.036   -   UCUGACUGUUUAUGUGUUUAAUAUAUAACC  221 10  1541    1   11  21  16  N   N   NA
----------------------------------------------------------
Nummber of potential loci:
33666Column 20 of the Results.txt file identifies if a cluster is a miRNA or not (Y or N).
# Load bash variables into memory
source .bashvars
echo "Number of loci characterized as miRNA:"
awk '$20=="Y" {print $0}' ${output_dir_top}/ShortStack_out/Results.txt \
| wc -l
echo ""
echo "----------------------------------------------------------"
echo ""
echo "Number of loci _not_ characterized as miRNA:"
awk '$20=="N" {print $0}' ${output_dir_top}/ShortStack_out/Results.txt \
| wc -l
Number of loci characterized as miRNA:
37
----------------------------------------------------------
Number of loci _not_ characterized as miRNA:
33629Column 21 of the Results.txt file identifies if a cluster aligned to a known miRNA (miRBase) or not (Y or NA).
# Load bash variables into memory
source .bashvars
echo "Number of loci matching miRBase miRNAs:"
awk '$21!="NA" {print $0}' ${output_dir_top}/ShortStack_out/Results.txt \
| wc -l
echo ""
echo "----------------------------------------------------------"
echo ""
echo "Number of loci _not_ matching miRBase miRNAs:"
awk '$21=="NA" {print $0}' ${output_dir_top}/ShortStack_out/Results.txt \
| wc -l
Number of loci matching miRBase miRNAs:
92
----------------------------------------------------------
Number of loci _not_ matching miRBase miRNAs:
33575Although there are 92 loci with matches to miRBase miRNAs, ShortStack did not annotate 55 of these clusters as miRNAs likely because they do not also match secondary structure criteria.
This explains the difference between the 46 and 37 miRNAs.
Many of these are large (by GitHub standards) BAM files, so will not be added to the repo.
Additionally, it’s unlikely we’ll utilize most of the other files (bigwig) generated by ShortStack.
# Load bash variables into memory
source .bashvars
tree -h ${output_dir_top}/
/home/shared/8TB_HDD_01/sam/gitrepos/RobertsLab/project-clam-oa/output/02.00-ShortStack-31bp-fastp-merged/
├── [ 27K]  shortstack.log
└── [436K]  ShortStack_out
    ├── [ 84M]  196-fastp-adapters-polyG-31bp-merged_condensed.bam
    ├── [697K]  196-fastp-adapters-polyG-31bp-merged_condensed.bam.csi
    ├── [270M]  196-fastp-adapters-polyG-31bp-merged_condensed.fa
    ├── [ 64M]  199-fastp-adapters-polyG-31bp-merged_condensed.bam
    ├── [642K]  199-fastp-adapters-polyG-31bp-merged_condensed.bam.csi
    ├── [205M]  199-fastp-adapters-polyG-31bp-merged_condensed.fa
    ├── [ 76M]  211-fastp-adapters-polyG-31bp-merged_condensed.bam
    ├── [666K]  211-fastp-adapters-polyG-31bp-merged_condensed.bam.csi
    ├── [242M]  211-fastp-adapters-polyG-31bp-merged_condensed.fa
    ├── [ 72M]  24-fastp-adapters-polyG-31bp-merged_condensed.bam
    ├── [670K]  24-fastp-adapters-polyG-31bp-merged_condensed.bam.csi
    ├── [223M]  24-fastp-adapters-polyG-31bp-merged_condensed.fa
    ├── [ 71M]  260-fastp-adapters-polyG-31bp-merged_condensed.bam
    ├── [672K]  260-fastp-adapters-polyG-31bp-merged_condensed.bam.csi
    ├── [221M]  260-fastp-adapters-polyG-31bp-merged_condensed.fa
    ├── [ 61M]  26-fastp-adapters-polyG-31bp-merged_condensed.bam
    ├── [646K]  26-fastp-adapters-polyG-31bp-merged_condensed.bam.csi
    ├── [187M]  26-fastp-adapters-polyG-31bp-merged_condensed.fa
    ├── [ 74M]  30-fastp-adapters-polyG-31bp-merged_condensed.bam
    ├── [658K]  30-fastp-adapters-polyG-31bp-merged_condensed.bam.csi
    ├── [235M]  30-fastp-adapters-polyG-31bp-merged_condensed.fa
    ├── [ 71M]  310-fastp-adapters-polyG-31bp-merged_condensed.bam
    ├── [667K]  310-fastp-adapters-polyG-31bp-merged_condensed.bam.csi
    ├── [222M]  310-fastp-adapters-polyG-31bp-merged_condensed.fa
    ├── [ 72M]  33-fastp-adapters-polyG-31bp-merged_condensed.bam
    ├── [665K]  33-fastp-adapters-polyG-31bp-merged_condensed.bam.csi
    ├── [227M]  33-fastp-adapters-polyG-31bp-merged_condensed.fa
    ├── [ 69M]  341-fastp-adapters-polyG-31bp-merged_condensed.bam
    ├── [666K]  341-fastp-adapters-polyG-31bp-merged_condensed.bam.csi
    ├── [213M]  341-fastp-adapters-polyG-31bp-merged_condensed.fa
    ├── [ 79M]  34-fastp-adapters-polyG-31bp-merged_condensed.bam
    ├── [669K]  34-fastp-adapters-polyG-31bp-merged_condensed.bam.csi
    ├── [251M]  34-fastp-adapters-polyG-31bp-merged_condensed.fa
    ├── [ 64M]  35-fastp-adapters-polyG-31bp-merged_condensed.bam
    ├── [648K]  35-fastp-adapters-polyG-31bp-merged_condensed.bam.csi
    ├── [199M]  35-fastp-adapters-polyG-31bp-merged_condensed.fa
    ├── [ 68M]  363-fastp-adapters-polyG-31bp-merged_condensed.bam
    ├── [640K]  363-fastp-adapters-polyG-31bp-merged_condensed.bam.csi
    ├── [216M]  363-fastp-adapters-polyG-31bp-merged_condensed.fa
    ├── [ 74M]  367-fastp-adapters-polyG-31bp-merged_condensed.bam
    ├── [673K]  367-fastp-adapters-polyG-31bp-merged_condensed.bam.csi
    ├── [231M]  367-fastp-adapters-polyG-31bp-merged_condensed.fa
    ├── [104M]  376-fastp-adapters-polyG-31bp-merged_condensed.bam
    ├── [703K]  376-fastp-adapters-polyG-31bp-merged_condensed.bam.csi
    ├── [339M]  376-fastp-adapters-polyG-31bp-merged_condensed.fa
    ├── [ 72M]  460-fastp-adapters-polyG-31bp-merged_condensed.bam
    ├── [671K]  460-fastp-adapters-polyG-31bp-merged_condensed.bam.csi
    ├── [230M]  460-fastp-adapters-polyG-31bp-merged_condensed.fa
    ├── [ 72M]  485-fastp-adapters-polyG-31bp-merged_condensed.bam
    ├── [665K]  485-fastp-adapters-polyG-31bp-merged_condensed.bam.csi
    ├── [228M]  485-fastp-adapters-polyG-31bp-merged_condensed.fa
    ├── [ 85M]  501-fastp-adapters-polyG-31bp-merged_condensed.bam
    ├── [697K]  501-fastp-adapters-polyG-31bp-merged_condensed.bam.csi
    ├── [269M]  501-fastp-adapters-polyG-31bp-merged_condensed.fa
    ├── [ 99M]  71-fastp-adapters-polyG-31bp-merged_condensed.bam
    ├── [725K]  71-fastp-adapters-polyG-31bp-merged_condensed.bam.csi
    ├── [320M]  71-fastp-adapters-polyG-31bp-merged_condensed.fa
    ├── [ 78M]  88-fastp-adapters-polyG-31bp-merged_condensed.bam
    ├── [683K]  88-fastp-adapters-polyG-31bp-merged_condensed.bam.csi
    ├── [250M]  88-fastp-adapters-polyG-31bp-merged_condensed.fa
    ├── [111K]  alignment_details.tsv
    ├── [3.4M]  Counts.txt
    ├── [1.6M]  known_miRNAs.gff3
    ├── [1.7M]  known_miRNAs_unaligned.fasta
    ├── [1.3G]  merged_alignments.bam
    ├── [747K]  merged_alignments.bam.csi
    ├── [ 11K]  mir.fasta
    ├── [3.3M]  Results.gff3
    ├── [4.8M]  Results.txt
    └── [4.0K]  strucVis
        ├── [8.0K]  Cluster_11267.ps.pdf
        ├── [6.7K]  Cluster_11267.txt
        ├── [ 10K]  Cluster_13141.ps.pdf
        ├── [ 20K]  Cluster_13141.txt
        ├── [ 11K]  Cluster_13225.ps.pdf
        ├── [ 37K]  Cluster_13225.txt
        ├── [8.2K]  Cluster_13226.ps.pdf
        ├── [ 17K]  Cluster_13226.txt
        ├── [9.0K]  Cluster_13708.ps.pdf
        ├── [ 29K]  Cluster_13708.txt
        ├── [9.4K]  Cluster_14095.ps.pdf
        ├── [ 33K]  Cluster_14095.txt
        ├── [9.3K]  Cluster_14096.ps.pdf
        ├── [ 52K]  Cluster_14096.txt
        ├── [8.9K]  Cluster_15325.ps.pdf
        ├── [5.9K]  Cluster_15325.txt
        ├── [9.7K]  Cluster_15635.ps.pdf
        ├── [ 18K]  Cluster_15635.txt
        ├── [8.3K]  Cluster_15636.ps.pdf
        ├── [4.8K]  Cluster_15636.txt
        ├── [ 12K]  Cluster_15637.ps.pdf
        ├── [ 19K]  Cluster_15637.txt
        ├── [8.6K]  Cluster_15638.ps.pdf
        ├── [ 23K]  Cluster_15638.txt
        ├── [ 11K]  Cluster_15639.ps.pdf
        ├── [ 37K]  Cluster_15639.txt
        ├── [9.1K]  Cluster_16249.ps.pdf
        ├── [4.1K]  Cluster_16249.txt
        ├── [9.3K]  Cluster_16250.ps.pdf
        ├── [3.5K]  Cluster_16250.txt
        ├── [9.1K]  Cluster_17619.ps.pdf
        ├── [6.1K]  Cluster_17619.txt
        ├── [9.0K]  Cluster_1785.ps.pdf
        ├── [ 16K]  Cluster_1785.txt
        ├── [8.4K]  Cluster_19246.ps.pdf
        ├── [6.2K]  Cluster_19246.txt
        ├── [8.5K]  Cluster_20177.ps.pdf
        ├── [9.0K]  Cluster_20177.txt
        ├── [ 10K]  Cluster_2042.ps.pdf
        ├── [ 54K]  Cluster_2042.txt
        ├── [8.4K]  Cluster_22161.ps.pdf
        ├── [ 41K]  Cluster_22161.txt
        ├── [9.5K]  Cluster_22586.ps.pdf
        ├── [ 44K]  Cluster_22586.txt
        ├── [7.7K]  Cluster_23993.ps.pdf
        ├── [ 13K]  Cluster_23993.txt
        ├── [8.6K]  Cluster_24004.ps.pdf
        ├── [ 12K]  Cluster_24004.txt
        ├── [9.0K]  Cluster_24617.ps.pdf
        ├── [7.5K]  Cluster_24617.txt
        ├── [8.4K]  Cluster_25443.ps.pdf
        ├── [ 21K]  Cluster_25443.txt
        ├── [8.3K]  Cluster_2646.ps.pdf
        ├── [ 21K]  Cluster_2646.txt
        ├── [9.2K]  Cluster_29018.ps.pdf
        ├── [ 29K]  Cluster_29018.txt
        ├── [ 12K]  Cluster_31844.ps.pdf
        ├── [ 26K]  Cluster_31844.txt
        ├── [7.5K]  Cluster_32917.ps.pdf
        ├── [4.5K]  Cluster_32917.txt
        ├── [9.3K]  Cluster_32918.ps.pdf
        ├── [ 25K]  Cluster_32918.txt
        ├── [7.5K]  Cluster_32919.ps.pdf
        ├── [3.4K]  Cluster_32919.txt
        ├── [9.4K]  Cluster_3720.ps.pdf
        ├── [ 10K]  Cluster_3720.txt
        ├── [8.2K]  Cluster_4396.ps.pdf
        ├── [2.2K]  Cluster_4396.txt
        ├── [8.0K]  Cluster_5150.ps.pdf
        ├── [2.1K]  Cluster_5150.txt
        ├── [7.5K]  Cluster_9285.ps.pdf
        ├── [5.6K]  Cluster_9285.txt
        ├── [ 11K]  Cluster_9399.ps.pdf
        └── [5.4K]  Cluster_9399.txt
2 directories, 144 files