Use ShortStack (Axtell 2013; Shahid and Axtell 2014; Johnson et al. 2016)to perform alignment of sRNAseq data and annotation of sRNA-producing genes.
This is the same ShortStack analysis as seen in 13.1-Pmea-sRNAseq-ShortStack-R1-reads.Rmd, but this analysis uses a customized miRBase database, created by Jill Ashley, which includes published cnidarian miRNAs:
The P.meandrina genome will be used as the reference genome.
Inputs:
Requires trimmed sRNAseq files generated by 08.1-Pmea-sRNAseq-trimming-R1-only.Rmd
*fastp-adapters-polyG-31bp-merged.fq.gz
P.meandrina genome FastA. See 12-Pmea-sRNAseq-MirMachine.Rmd for download info if needed.
Outputs:
Software requirements:
Replace with name of your ShortStack environment and the path to the corresponding conda installation (find this after you’ve activated the environment).
E.g.
# Activate environment
conda activate ShortStack4_env
# Find conda path
which conda
shortstack_conda_env_name <- c("ShortStack-4.0.3_env")
shortstack_cond_path <- c("/home/sam/programs/mambaforge/condabin/conda")
This allows usage of Bash variables across R Markdown chunks.
{
echo "#### Assign Variables ####"
echo ""
echo "# Trimmed FastQ naming pattern"
echo "export trimmed_fastqs_pattern='*fastp-adapters-polyG-31bp-merged.fq.gz'"
echo "# Data directories"
echo 'export deep_dive_dir=/home/shared/8TB_HDD_01/sam/gitrepos/deep-dive'
echo 'export deep_dive_data_dir="${deep_dive_dir}/data"'
echo 'export output_dir_top=${deep_dive_dir}/F-Pmea/output/13.2.1-Pmea-sRNAseq-ShortStack-31bp-fastp-merged-cnidarian_miRBase'
echo 'export trimmed_fastqs_dir="${deep_dive_dir}/F-Pmea/output/08.2-Pmea-sRNAseq-trimming-31bp-fastp-merged/trimmed-reads"'
echo ""
echo "# Input/Output files"
echo 'export genome_fasta_dir=${deep_dive_dir}/F-Pmea/data'
echo 'export genome_fasta_name="Pocillopora_meandrina_HIv1.assembly.fasta"'
echo 'export shortstack_genome_fasta_name="Pocillopora_meandrina_HIv1.assembly.fa"'
echo 'export mirbase_mature_fasta_version=cnidarian_miRNAs.fasta'
echo 'export genome_fasta="${genome_fasta_dir}/${shortstack_genome_fasta_name}"'
echo ""
echo "# Set number of CPUs to use"
echo 'export threads=46'
echo ""
echo "# Initialize arrays"
echo 'export trimmed_fastqs_array=()'
} > .bashvars
cat .bashvars
#### Assign Variables ####
# Trimmed FastQ naming pattern
export trimmed_fastqs_pattern='*fastp-adapters-polyG-31bp-merged.fq.gz'
# Data directories
export deep_dive_dir=/home/shared/8TB_HDD_01/sam/gitrepos/deep-dive
export deep_dive_data_dir="${deep_dive_dir}/data"
export output_dir_top=${deep_dive_dir}/F-Pmea/output/13.2.1-Pmea-sRNAseq-ShortStack-31bp-fastp-merged-cnidarian_miRBase
export trimmed_fastqs_dir="${deep_dive_dir}/F-Pmea/output/08.2-Pmea-sRNAseq-trimming-31bp-fastp-merged/trimmed-reads"
# Input/Output files
export genome_fasta_dir=${deep_dive_dir}/F-Pmea/data
export genome_fasta_name="Pocillopora_meandrina_HIv1.assembly.fasta"
export shortstack_genome_fasta_name="Pocillopora_meandrina_HIv1.assembly.fa"
export mirbase_mature_fasta_version=cnidarian_miRNAs.fasta
export genome_fasta="${genome_fasta_dir}/${shortstack_genome_fasta_name}"
# Set number of CPUs to use
export threads=46
# Initialize arrays
export trimmed_fastqs_array=()
If this is successful, the first line of output should show that the Python being used is the one in your [ShortStack](https://github.com/MikeAxtell/ShortStack conda environment path.
E.g.
python: /home/sam/programs/mambaforge/envs/mirmachine_env/bin/python
use_condaenv(condaenv = shortstack_conda_env_name, conda = shortstack_cond_path)
# Check successful env loading
py_config()
python: /home/sam/programs/mambaforge/envs/ShortStack-4.0.3_env/bin/python
libpython: /home/sam/programs/mambaforge/envs/ShortStack-4.0.3_env/lib/libpython3.10.so
pythonhome: /home/sam/programs/mambaforge/envs/ShortStack-4.0.3_env:/home/sam/programs/mambaforge/envs/ShortStack-4.0.3_env
version: 3.10.13 | packaged by conda-forge | (main, Dec 23 2023, 15:36:39) [GCC 12.3.0]
numpy: /home/sam/programs/mambaforge/envs/ShortStack-4.0.3_env/lib/python3.10/site-packages/numpy
numpy_version: 1.26.4
NOTE: Python version was forced by use_python() function
# Load bash variables into memory
source .bashvars
# Check for FastA file first
# Then create rename file if doesn't exist
if [ -f "${genome_fasta_dir}/${shortstack_genome_fasta_name}" ]; then
echo "${genome_fasta_dir}/${shortstack_genome_fasta_name} already exists. Nothing to do."
echo ""
else
# Copy genome FastA to ShortStack-compatible filename (ending with .fa)
cp ${genome_fasta_dir}/${genome_fasta_name} ${genome_fasta_dir}/${shortstack_genome_fasta_name}
fi
# Confirm
ls -lh ${genome_fasta_dir}/${shortstack_genome_fasta_name}
/home/shared/8TB_HDD_01/sam/gitrepos/deep-dive/F-Pmea/data/Pocillopora_meandrina_HIv1.assembly.fa already exists. Nothing to do.
-rw-r--r-- 1 sam sam 360M Feb 16 10:28 /home/shared/8TB_HDD_01/sam/gitrepos/deep-dive/F-Pmea/data/Pocillopora_meandrina_HIv1.assembly.fa
Uses the --dn_mirna
option to identify miRNAs in the genome, without relying on the --known_miRNAs
.
This part of the code redirects the output of time
to the end of shortstack.log
file.
; } \ 2>> ${output_dir_top}/shortstack.log
# Load bash variables into memory
source .bashvars
# Make output directory, if it doesn't exist
mkdir --parents "${output_dir_top}"
# Create array of trimmed FastQs
trimmed_fastqs_array=(${trimmed_fastqs_dir}/${trimmed_fastqs_pattern})
# Pass array contents to new variable as space-delimited list
trimmed_fastqs_list=$(echo "${trimmed_fastqs_array[*]}")
###### Run ShortStack ######
{ time \
ShortStack \
--genomefile "${genome_fasta}" \
--readfile ${trimmed_fastqs_list} \
--known_miRNAs ${deep_dive_data_dir}/${mirbase_mature_fasta_version} \
--dn_mirna \
--threads ${threads} \
--outdir ${output_dir_top}/ShortStack_out \
&> ${output_dir_top}/shortstack.log ; } \
2>> ${output_dir_top}/shortstack.log
# Load bash variables into memory
source .bashvars
tail -n 3 ${output_dir_top}/shortstack.log \
| grep "real" \
| awk '{print "ShortStack runtime:" "\t" $2}'
ShortStack runtime: 44m18.602s
# Load bash variables into memory
source .bashvars
tail -n 25 ${output_dir_top}/shortstack.log
Writing final files
Found a total of 36 MIRNA loci
Non-MIRNA loci by DicerCall:
N 7088
22 36
23 33
21 16
24 14
Creating visualizations of microRNA loci with strucVis
<<< WARNING >>>
Do not rely on these results alone to annotate new MIRNA loci!
The false positive rate for de novo MIRNA identification is low, but NOT ZERO
Insepct each mirna locus, especially the strucVis output, and see
https://doi.org/10.1105/tpc.17.00851 , https://doi.org/10.1093/nar/gky1141
Fri 16 Feb 2024 15:40:13 -0800 PST
Run Completed!
real 44m18.602s
user 805m46.145s
sys 284m7.125s
ShortStack identified 36 miRNAs.
Results.txt
# Load bash variables into memory
source .bashvars
head ${output_dir_top}/ShortStack_out/Results.txt
echo ""
echo "----------------------------------------------------------"
echo ""
echo "Nummber of potential loci:"
awk '(NR>1)' ${output_dir_top}/ShortStack_out/Results.txt | wc -l
Locus Name Chrom Start End Length Reads DistinctSequences FracTop Strand MajorRNA MajorRNAReads Short Long 21 22 23 24 DicerCall MIRNA known_miRNAs
Pocillopora_meandrina_HIv1___Sc0000000:9092-9521 Cluster_1 Pocillopora_meandrina_HIv1___Sc0000000 9092 9521 430 10746 359 1.0 + GGGGGUAUAGCUCAGUGGUAGA 3828 1458 3708 639 4351 170 420 N N NA
Pocillopora_meandrina_HIv1___Sc0000000:53578-53997 Cluster_2 Pocillopora_meandrina_HIv1___Sc0000000 53578 53997 420 286 13 0.9965034965034965 + GCCUAAGUUGCUUGGAACA 137 284 2 0 0 0 0 N N NA
Pocillopora_meandrina_HIv1___Sc0000000:150243-150718 Cluster_3 Pocillopora_meandrina_HIv1___Sc0000000 150243 150718 476 2549 247 0.0 - UGGCUAUGAUGAAAAUGACU 335 849 380 634 376 139 171 N N NA
Pocillopora_meandrina_HIv1___Sc0000000:173728-174150 Cluster_4 Pocillopora_meandrina_HIv1___Sc0000000 173728 174150 423 1257 65 0.9968178202068417 + UUUGAUUGCUGUGAUCUGGUUG 432 106 2 39 636 444 30 22 N apa-mir-2050_Exaiptasia_pallida_Baumgarten_et_al._2017_miR-2050;_Nve;_Spis;_Adi
Pocillopora_meandrina_HIv1___Sc0000000:187562-188076 Cluster_5 Pocillopora_meandrina_HIv1___Sc0000000 187562 188076 515 182 36 0.42857142857142855 . AUAAAUGUCACUACAAGAAACCUGAAAUCGU 25 0 174 1 1 2 4 N N NA
Pocillopora_meandrina_HIv1___Sc0000000:485730-486254 Cluster_6 Pocillopora_meandrina_HIv1___Sc0000000 485730 486254 525 286 127 1.0 + GAUGGGUGUUAUUACUCCUCAGACAGAC 48 66 183 7 11 3 16 N N NA
Pocillopora_meandrina_HIv1___Sc0000000:496020-496432 Cluster_7 Pocillopora_meandrina_HIv1___Sc0000000 496020 496432 413 72 24 1.0 + AUGUAGUCGAGCAAAGUCCAUGUGGACGA 27 0 66 2 1 1 2 N N NA
Pocillopora_meandrina_HIv1___Sc0000000:525310-527341 Cluster_8 Pocillopora_meandrina_HIv1___Sc0000000 525310 527341 2032 14737 2989 0.17866594286489787 - UUUUCGUCACUUUCUUCAGCCUCAGAGU 966 140 13633 54 106 304 500 N N NA
Pocillopora_meandrina_HIv1___Sc0000000:541262-541723 Cluster_9 Pocillopora_meandrina_HIv1___Sc0000000 541262 541723 462 756 142 0.0873015873015873 - UUGGACGAAAUUUCGAGGUUCACACUCGUU 91 1 750 0 4 1 0 N N NA
----------------------------------------------------------
Nummber of potential loci:
7223
Column 20 of the Results.txt
file identifies if a cluster is a miRNA or not (Y
or N
).
# Load bash variables into memory
source .bashvars
echo "Number of loci characterized as miRNA:"
awk '$20=="Y" {print $0}' ${output_dir_top}/ShortStack_out/Results.txt \
| wc -l
echo ""
echo "----------------------------------------------------------"
echo ""
echo "Number of loci _not_ characterized as miRNA:"
awk '$20=="N" {print $0}' ${output_dir_top}/ShortStack_out/Results.txt \
| wc -l
Number of loci characterized as miRNA:
36
----------------------------------------------------------
Number of loci _not_ characterized as miRNA:
7187
Column 21 of the Results.txt
file identifies if a cluster aligned to a known miRNA (miRBase) or not (Y
or NA
).
Since there are no miRNAs, the following code will not print any output.
The echo
command after the awk
command is simply there to prove that the chunk executed.
# Load bash variables into memory
source .bashvars
echo "Number of loci matching miRBase miRNAs:"
awk '$21!="NA" {print $0}' ${output_dir_top}/ShortStack_out/Results.txt \
| wc -l
echo ""
echo "----------------------------------------------------------"
echo ""
echo "Number of loci _not_ matching miRBase miRNAs:"
awk '$21=="NA" {print $0}' ${output_dir_top}/ShortStack_out/Results.txt \
| wc -l
Number of loci matching miRBase miRNAs:
30
----------------------------------------------------------
Number of loci _not_ matching miRBase miRNAs:
7194
Although there are loci with matches to miRBase miRNAs, ShortStack did not annotate these clusters as miRNAs likely because they do not also match secondary structure criteria.
Many of these are large (by GitHub standards) BAM files, so will not be added to the repo.
Additionally, it’s unlikely we’ll utilize most of the other files (bigwig) generated by ShortStack.
# Load bash variables into memory
source .bashvars
tree -h ${output_dir_top}/
/home/shared/8TB_HDD_01/sam/gitrepos/deep-dive/F-Pmea/output/13.2.1-Pmea-sRNAseq-ShortStack-31bp-fastp-merged-cnidarian_miRBase/
├── [ 22K] shortstack.log
└── [ 16K] ShortStack_out
├── [ 31K] alignment_details.tsv
├── [606K] Counts.txt
├── [ 20K] known_miRNAs.gff3
├── [ 38K] known_miRNAs_unaligned.fasta
├── [6.9M] merged_alignments_21_m.bw
├── [7.1M] merged_alignments_21_p.bw
├── [6.7M] merged_alignments_22_m.bw
├── [6.8M] merged_alignments_22_p.bw
├── [ 13M] merged_alignments_23-24_m.bw
├── [ 13M] merged_alignments_23-24_p.bw
├── [1.0G] merged_alignments.bam
├── [152K] merged_alignments.bam.csi
├── [ 45M] merged_alignments_other_m.bw
├── [ 45M] merged_alignments_other_p.bw
├── [ 23M] merged_alignments_sRNA-POC-47-S1-TP2-fastp-adapters-polyG-31bp-merged.bw
├── [ 30M] merged_alignments_sRNA-POC-48-S1-TP2-fastp-adapters-polyG-31bp-merged.bw
├── [ 24M] merged_alignments_sRNA-POC-50-S1-TP2-fastp-adapters-polyG-31bp-merged.bw
├── [ 38M] merged_alignments_sRNA-POC-53-S1-TP2-fastp-adapters-polyG-31bp-merged.bw
├── [ 49M] merged_alignments_sRNA-POC-57-S1-TP2-fastp-adapters-polyG-31bp-merged.bw
├── [ 13K] mir.fasta
├── [919K] Results.gff3
├── [1.4M] Results.txt
├── [179M] sRNA-POC-47-S1-TP2-fastp-adapters-polyG-31bp-merged.bam
├── [168K] sRNA-POC-47-S1-TP2-fastp-adapters-polyG-31bp-merged.bam.csi
├── [192M] sRNA-POC-48-S1-TP2-fastp-adapters-polyG-31bp-merged.bam
├── [169K] sRNA-POC-48-S1-TP2-fastp-adapters-polyG-31bp-merged.bam.csi
├── [174M] sRNA-POC-50-S1-TP2-fastp-adapters-polyG-31bp-merged.bam
├── [169K] sRNA-POC-50-S1-TP2-fastp-adapters-polyG-31bp-merged.bam.csi
├── [231M] sRNA-POC-53-S1-TP2-fastp-adapters-polyG-31bp-merged.bam
├── [169K] sRNA-POC-53-S1-TP2-fastp-adapters-polyG-31bp-merged.bam.csi
├── [209M] sRNA-POC-57-S1-TP2-fastp-adapters-polyG-31bp-merged.bam
├── [160K] sRNA-POC-57-S1-TP2-fastp-adapters-polyG-31bp-merged.bam.csi
└── [4.0K] strucVis
├── [ 12K] Cluster_1011.ps
├── [ 35K] Cluster_1011.txt
├── [ 11K] Cluster_1066.ps
├── [3.6K] Cluster_1066.txt
├── [ 11K] Cluster_1078.ps
├── [ 10K] Cluster_1078.txt
├── [ 11K] Cluster_1116.ps
├── [7.4K] Cluster_1116.txt
├── [ 12K] Cluster_1286.ps
├── [ 25K] Cluster_1286.txt
├── [ 13K] Cluster_1292.ps
├── [ 64K] Cluster_1292.txt
├── [ 13K] Cluster_1794.ps
├── [ 41K] Cluster_1794.txt
├── [ 12K] Cluster_1933.ps
├── [ 35K] Cluster_1933.txt
├── [ 12K] Cluster_1951.ps
├── [ 39K] Cluster_1951.txt
├── [ 12K] Cluster_1952.ps
├── [ 30K] Cluster_1952.txt
├── [ 12K] Cluster_21.ps
├── [ 30K] Cluster_21.txt
├── [ 12K] Cluster_2788.ps
├── [8.4K] Cluster_2788.txt
├── [ 12K] Cluster_2831.ps
├── [5.2K] Cluster_2831.txt
├── [ 11K] Cluster_2833.ps
├── [7.4K] Cluster_2833.txt
├── [ 11K] Cluster_2853.ps
├── [ 14K] Cluster_2853.txt
├── [ 11K] Cluster_2971.ps
├── [5.6K] Cluster_2971.txt
├── [ 11K] Cluster_3389.ps
├── [ 30K] Cluster_3389.txt
├── [ 12K] Cluster_361.ps
├── [ 41K] Cluster_361.txt
├── [ 11K] Cluster_3663.ps
├── [ 14K] Cluster_3663.txt
├── [ 11K] Cluster_37.ps
├── [3.7K] Cluster_37.txt
├── [ 12K] Cluster_4049.ps
├── [7.7K] Cluster_4049.txt
├── [ 12K] Cluster_4050.ps
├── [5.1K] Cluster_4050.txt
├── [ 12K] Cluster_4130.ps
├── [ 14K] Cluster_4130.txt
├── [ 12K] Cluster_4455.ps
├── [ 35K] Cluster_4455.txt
├── [ 12K] Cluster_4457.ps
├── [ 53K] Cluster_4457.txt
├── [ 12K] Cluster_4458.ps
├── [ 24K] Cluster_4458.txt
├── [ 12K] Cluster_4459.ps
├── [ 30K] Cluster_4459.txt
├── [ 12K] Cluster_4586.ps
├── [ 64K] Cluster_4586.txt
├── [ 12K] Cluster_4768.ps
├── [7.0K] Cluster_4768.txt
├── [ 12K] Cluster_4838.ps
├── [ 31K] Cluster_4838.txt
├── [ 11K] Cluster_5273.ps
├── [4.5K] Cluster_5273.txt
├── [ 12K] Cluster_5633.ps
├── [ 16K] Cluster_5633.txt
├── [ 12K] Cluster_5761.ps
├── [ 17K] Cluster_5761.txt
├── [ 12K] Cluster_6425.ps
├── [ 21K] Cluster_6425.txt
├── [ 12K] Cluster_759.ps
├── [ 27K] Cluster_759.txt
├── [ 12K] Cluster_918.ps
└── [ 30K] Cluster_918.txt
2 directories, 105 files
We noticed that a) not all of the identified miRNAs have database matches, and b) some reads have a match in the database but are not classified as miRNAs. Let’s look at this in more depth.
Pmea_shortstack_results <- read.csv("../output/13.2.1-Pmea-sRNAseq-ShortStack-31bp-fastp-merged-cnidarian_miRBase/ShortStack_out/Results.txt", sep="\t")
# Reads identified as miRNAs (but not necessarily known)
Pmea_shortstack_results %>%
filter(MIRNA == "Y") %>%
mutate(known_miRNAs = str_sub(known_miRNAs, 1, 40)) %>%
mutate(Locus = str_sub(Locus, 20, 40)) %>%
ggplot(aes(x = reorder(Locus, Reads), y = Reads, fill = known_miRNAs)) +
geom_bar(stat = "identity", position = "dodge") +
geom_text(aes(label = Reads), vjust = -0.5, position = position_dodge(width = -0.5), color = "black", size = 2.5, angle = 90) +
labs(x = "miRNA", y = "Read count",
title = "Reads identified by ShortStack as miRNAs",
fill = "Annotation") +
theme(axis.text.x = element_blank(),
axis.ticks.x = element_blank())
ggsave("../output/13.2.1-Pmea-sRNAseq-ShortStack-31bp-fastp-merged-cnidarian_miRBase/figures/Pmea_ShortStack_miRNA_histogram.png", width = 12, height = 7, units = "in")
# Reads matched in the reference db (but not necessarily identified as miRNA)
Pmea_shortstack_results %>%
filter(!is.na(known_miRNAs)) %>%
mutate(known_miRNAs = str_sub(known_miRNAs, 1, 40)) %>%
mutate(Locus = str_sub(Locus, 20, 40)) %>%
ggplot(aes(x = reorder(Locus, Reads), y = Reads, fill = MIRNA)) +
geom_bar(stat = "identity", position = "dodge") +
geom_text(aes(label = Reads), vjust = 0.5, position = position_dodge(width = -0.5), color = "black", size = 2.5, angle = 90) +
labs(x = "miRNA", y = "Read count",
title = "Reads with miRBase+cnidarian database matches",
fill = "Identified as miRNA?") +
theme(axis.text.x = element_blank(),
axis.ticks.x = element_blank())
ggsave("../output/13.2.1-Pmea-sRNAseq-ShortStack-31bp-fastp-merged-cnidarian_miRBase/figures/Pmea_ShortStack_dbmatch_histogram.png", width = 12, height = 7, units = "in")
There’s one miRNA with a very high read count, and it’s making visualization of the rest difficult. Let’s remove it and retry visualizing the rest.
# Reads identified as miRNAs (but not necessarily known)
Pmea_shortstack_results %>%
filter(MIRNA == "Y") %>%
filter(Reads < 200000) %>%
mutate(known_miRNAs = str_sub(known_miRNAs, 1, 40)) %>%
mutate(Locus = str_sub(Locus, 20, 40)) %>%
ggplot(aes(x = reorder(Locus, Reads), y = Reads, fill = known_miRNAs)) +
geom_bar(stat = "identity", position = "dodge") +
geom_text(aes(label = Reads), vjust = 0.5, hjust = 0, color = "black", size = 2.5, angle = 90) +
labs(x = "miRNA", y = "Read count",
title = "Reads identified by ShortStack as miRNAs",
fill = "Annotation") +
theme(axis.text.x = element_blank(),
axis.ticks.x = element_blank())
ggsave("../output/13.2.1-Pmea-sRNAseq-ShortStack-31bp-fastp-merged-cnidarian_miRBase/figures/Pmea_ShortStack_miRNA_histogram_reduced.png", width = 12, height = 7, units = "in")
# Make list
mirnas <- Pmea_shortstack_results %>% filter(MIRNA == "Y") %>% pull(Locus)
matches <- Pmea_shortstack_results %>% filter(!is.na(known_miRNAs)) %>% pull(Locus)
Pmea_shortstack_vennlist <- list(
"Identified as miRNA" = mirnas,
"Database match" = matches
)
# Make venn diagrams
ggvenn(Pmea_shortstack_vennlist)
ggsave("../output/13.2.1-Pmea-sRNAseq-ShortStack-31bp-fastp-merged-cnidarian_miRBase/figures/Pmea_ShortStack_venn.png", width = 12, height = 7, units = "in")