Extract mature miRNAs identified with matches to miRBase by ShortStack in 13.2.1-Apul-sRNAseq-ShortStack-31bp-fastp-merged-cnidarian_miRBase.Rmd to FastA.
This allows usage of Bash variables across R Markdown chunks.
{
echo "#### Assign Variables ####"
echo ""
echo "# Trimmed FastQ naming pattern"
echo "# Data directories"
echo 'export deep_dive_dir=/home/shared/8TB_HDD_01/sam/gitrepos/deep-dive'
echo 'export shortstack_dir="${deep_dive_dir}/D-Apul/output/13.2.1-Apul-sRNAseq-ShortStack-31bp-fastp-merged-cnidarian_miRBase/ShortStack_out"'
echo 'export output_dir_top=${deep_dive_dir}/D-Apul/output/13.2.1.1-Apul-sRNAseq-ShortStack-FastA-extraction'
echo ""
echo "# Input/Output files"
echo 'export output_fasta="mature-miRBase-matches.fasta"'
echo 'export shortstack_fasta="mir.fasta"'
echo 'export shortstack_fasta_index="mir.fasta.fai"'
echo 'export shortstack_fixed_fasta="mir-coords-fixed.fasta"'
echo 'export shortstack_fixed_fasta_index="mir-coords-fixed.fasta.fai"'
echo 'export shortstack_results_file="Results.txt"'
echo 'export regions="mature-miRBase-regions.txt"'
echo ""
echo "# Set number of CPUs to use"
echo 'export threads=40'
echo ""
echo "# Programs"
echo 'export samtools=/home/shared/samtools-1.12/samtools'
} > .bashvars
cat .bashvars
#### Assign Variables ####
# Trimmed FastQ naming pattern
# Data directories
export deep_dive_dir=/home/shared/8TB_HDD_01/sam/gitrepos/deep-dive
export shortstack_dir="${deep_dive_dir}/D-Apul/output/13.2.1-Apul-sRNAseq-ShortStack-31bp-fastp-merged-cnidarian_miRBase/ShortStack_out"
export output_dir_top=${deep_dive_dir}/D-Apul/output/13.2.1.1-Apul-sRNAseq-ShortStack-FastA-extraction
# Input/Output files
export output_fasta="mature-miRBase-matches.fasta"
export shortstack_fasta="mir.fasta"
export shortstack_fasta_index="mir.fasta.fai"
export shortstack_fixed_fasta="mir-coords-fixed.fasta"
export shortstack_fixed_fasta_index="mir-coords-fixed.fasta.fai"
export shortstack_results_file="Results.txt"
export regions="mature-miRBase-regions.txt"
# Set number of CPUs to use
export threads=40
# Programs
export samtools=/home/shared/samtools-1.12/samtools
Results.txt
# Load bash variables into memory
source .bashvars
head "${shortstack_dir}/${shortstack_results_file}" | column -t
Locus Name Chrom Start End Length Reads DistinctSequences FracTop Strand MajorRNA MajorRNAReads Short Long 21 22 23 24 DicerCall MIRNA known_miRNAs
NC_058066.1:152483-152910 Cluster_1 NC_058066.1 152483 152910 428 140 32 0.05 - UAAGUACUUUAUCAACUAACUCUAGGCA 75 1 130 0 2 0 7 N N NA
NC_058066.1:161064-161674 Cluster_2 NC_058066.1 161064 161674 611 549 247 0.2987249544626594 . UUUUAGCCUAGUGCGGGUUUCCAGACGU 43 25 479 16 4 4 21 N N NA
NC_058066.1:172073-172496 Cluster_3 NC_058066.1 172073 172496 424 105 40 0.12380952380952381 - GCGAUUAUUAACGGCUGGAACGACAGGCGA 16 1 88 1 1 0 14 N N NA
NC_058066.1:203242-203651 Cluster_4 NC_058066.1 203242 203651 410 100 45 0.56 . UUCUGACUCUAUUAGCAACGAAGACUUU 26 1 96 0 1 0 2 N N NA
NC_058066.1:204535-205150 Cluster_5 NC_058066.1 204535 205150 616 313 157 0.7763578274760383 . UCCCAACACGUCUAGACUGUACAAUUUCU 32 3 304 1 1 2 2 N N NA
NC_058066.1:205745-206966 Cluster_6 NC_058066.1 205745 206966 1222 1930 416 0.35544041450777203 . CAAAAGAGCGGACAAAAUAGUCGACAGAUU 716 3 1882 5 10 7 23 N N NA
NC_058066.1:210841-211344 Cluster_7 NC_058066.1 210841 211344 504 1247 333 0.7457898957497995 . UAAUACUUGUAGUGAAGGUUCAAUCUCGA 95 10 1133 7 7 20 70 N N NA
NC_058066.1:349655-351297 Cluster_8 NC_058066.1 349655 351297 1643 3279 1165 0.8127477889600488 + UCAGCUUGGAAAUGACAGCUUUUGACGU 255 27 3141 10 22 17 62 N N NA
NC_058066.1:351491-353439 Cluster_9 NC_058066.1 351491 353439 1949 8889 1615 0.4114073574080324 . UUUCAAAUCAAAGAUCUUCGCAACGAUGA 780 82 8503 34 34 114 122 N N NA
Column 1: Region of miRNA match
Column 20: ShortStack miRNA? Y/N
Column 21: Match to miRBase? NA or miRBase match
# Load bash variables into memory
source .bashvars
awk '{print $1"\t"$20"\t"$21}' "${shortstack_dir}/${shortstack_results_file}" | head | column -t
Locus MIRNA known_miRNAs
NC_058066.1:152483-152910 N NA
NC_058066.1:161064-161674 N NA
NC_058066.1:172073-172496 N NA
NC_058066.1:203242-203651 N NA
NC_058066.1:204535-205150 N NA
NC_058066.1:205745-206966 N NA
NC_058066.1:210841-211344 N NA
NC_058066.1:349655-351297 N NA
NC_058066.1:351491-353439 N NA
# Load bash variables into memory
source .bashvars
awk '$20 == "Y" && $21 != "NA" {print $1"\t"$20"\t"$21}' "${shortstack_dir}/${shortstack_results_file}" | head | column -t
echo ""
echo "------------------------------------------"
echo ""
echo "Number of miRNAs matching miRBase:"
awk '$20 == "Y" && $21 != "NA" {print $1"\t"$20"\t"$21}' "${shortstack_dir}/${shortstack_results_file}" | wc -l
NC_058066.1:12757125-12757218 Y ami-miR-P-novel-3-3p_Acropora_millepora_Praher_et_al._2021_NA;adi-ami-miR-P-novel-3-3p_Acropora_digitifera__Praher_et_al._2021_NA
NC_058066.1:20346227-20346321 Y ami-miR-P-novel-1-3p_Acropora_millepora_Praher_et_al._2021_NA;Adi-Mir-Novel-1_3p_Acropora_digitifera_Gajigan_&_Conaco_2017_nve-miR-9437;adi-miR-P-novel-1-3p_Acropora_digitifera__Praher_et_al._2021_NA
NC_058067.1:5656192-5656286 Y spi-mir-temp-25_Stylophora_pistillata_Liew_et_al._2014_Considered_bona_fideas_close_match_to_nve-_and_hma-miR-2022.;eca-nve-F-miR-2022_Edwardsiella_carnea_Praher_et_al._2021_Transcriptome-level;Adi-Mir-2022_3p_Acropora_digitifera_Gajigan_&_Conaco_2017_nve-miR-2022-3p;_nve-miR-2022-3p;_spi-miR-temp-25;ami-nve-F-miR-2022-3p_Acropora_millepora_Praher_et_al._2021_NA
NC_058067.1:16118215-16118311 Y ami-miR-P-novel-4-5p_Acropora_millepora_Praher_et_al._2021_NA
NC_058067.1:25838831-25838927 Y ami-miR-P-novel-5-5p_Acropora_millepora_Praher_et_al._2021_NA
NC_058068.1:3756500-3756594 Y ami-Adi-MiR-G-Novel-5-3p_Acropora_millepora_Praher_et_al._2021_NA;Adi-Mir-Novel-5_3p_Acropora_digitifera_Gajigan_&_Conaco_2017_NA;Adi-MiR-G-Novel-5_3p_Acropora_digitifera__Praher_et_al._2021_NA
NC_058069.1:1868134-1868229 Y adi-miR-P-novel-4-3p_Acropora_digitifera__Praher_et_al._2021_NA
NC_058069.1:2018489-2018586 Y ami-miR-P-novel-17-3p_Acropora_millepora_Praher_et_al._2021_NA
NC_058069.1:21012147-21012242 Y ami-nve-F-miR-2025-3p_Acropora_millepora_Praher_et_al._2021_NA;Adi-Mir-2025_3p_Acropora_digitifera_Gajigan_&_Conaco_2017_nve-miR-2025-3p;_nve-miR-2025-3p;adi-nve-F-miR-2025_Acropora_digitifera__Praher_et_al._2021_NA
NC_058070.1:2184956-2185050 Y Adi-Mir-9425_5p_Acropora_digitifera_Gajigan_&_Conaco_2017_nve-miR-9425;_nve-miR-9425;ami-nve-F-miR-9425-5p_Acropora_millepora_Praher_et_al._2021_NA;nve-miR-9425;miR-9425_Nematostella_vectensis_Moran_et_al._2014_NA
------------------------------------------
Number of miRNAs matching miRBase:
24
# Load bash variables into memory
source .bashvars
grep "^>" "${shortstack_dir}/${shortstack_fasta}" | head
>Cluster_316::NC_058066.1:12757124-12757218(-)
>Cluster_316.mature::NC_058066.1:12757146-12757168(-)
>Cluster_316.star::NC_058066.1:12757176-12757198(-)
>Cluster_514::NC_058066.1:20088629-20088720(+)
>Cluster_514.mature::NC_058066.1:20088678-20088700(+)
>Cluster_514.star::NC_058066.1:20088649-20088671(+)
>Cluster_548::NC_058066.1:20346226-20346321(-)
>Cluster_548.mature::NC_058066.1:20346248-20346271(-)
>Cluster_548.star::NC_058066.1:20346278-20346301(-)
>Cluster_1506::NC_058067.1:5656191-5656286(-)
Needed, due to bug in code (GitHub Issue) which incorrectly calculates the starting coordinates in the FastA output. All other files where start/stop coordinates are conveyed are correct.
The incorrect starting coordinates cause an issue in downstream manipulation, because the FastA headers need to match the ShortStack results file.
# Load bash variables into memory
source .bashvars
awk '
/^>/ {
# Split the line into main parts based on "::" delimiter
split($0, main_parts, "::")
# Extract the coordinate part and strand information separately
coordinates_strand = main_parts[2]
split(coordinates_strand, coord_parts, "[:-]")
# Determine if the strand information is present and extract it
strand = ""
if (substr(coordinates_strand, length(coordinates_strand)) ~ /[\(\)\-\+]/) {
strand = substr(coordinates_strand, length(coordinates_strand) - 1)
coordinates_strand = substr(coordinates_strand, 1, length(coordinates_strand) - 2)
split(coordinates_strand, coord_parts, "[:-]")
}
# Increment the starting coordinate by 1
new_start = coord_parts[2] + 1
# Reconstruct the description line with the new starting coordinate
new_description = main_parts[1] "::" coord_parts[1] ":" new_start "-" coord_parts[3] strand
# Print the modified description line
print new_description
# Skip to the next line to process the sequence line
next
}
# For sequence lines, print them as-is
{
print
}
' "${shortstack_dir}/${shortstack_fasta}" \
> "${shortstack_dir}/${shortstack_fixed_fasta}"
diff "${shortstack_dir}/${shortstack_fasta}" \
"${shortstack_dir}/${shortstack_fixed_fasta}" \
| head
1c1
< >Cluster_316::NC_058066.1:12757124-12757218(-)
---
> >Cluster_316::NC_058066.1:12757125-12757218(-)
3c3
< >Cluster_316.mature::NC_058066.1:12757146-12757168(-)
---
> >Cluster_316.mature::NC_058066.1:12757147-12757168(-)
5c5
< >Cluster_316.star::NC_058066.1:12757176-12757198(-)
samtools
# Load bash variables into memory
source .bashvars
${samtools} faidx "${shortstack_dir}/${shortstack_fixed_fasta}"
head "${shortstack_dir}/${shortstack_fixed_fasta_index}"
Cluster_316::NC_058066.1:12757125-12757218(-) 94 47 94 95
Cluster_316.mature::NC_058066.1:12757147-12757168(-) 22 196 22 23
Cluster_316.star::NC_058066.1:12757177-12757198(-) 22 271 22 23
Cluster_514::NC_058066.1:20088630-20088720(+) 91 341 91 92
Cluster_514.mature::NC_058066.1:20088679-20088700(+) 22 487 22 23
Cluster_514.star::NC_058066.1:20088650-20088671(+) 22 562 22 23
Cluster_548::NC_058066.1:20346227-20346321(-) 95 632 95 96
Cluster_548.mature::NC_058066.1:20346249-20346271(-) 23 782 23 24
Cluster_548.star::NC_058066.1:20346279-20346301(-) 23 858 23 24
Cluster_1506::NC_058067.1:5656192-5656286(-) 95 928 95 96
# Load bash variables into memory
source .bashvars
# Make output directory, if it doesn't exist
mkdir --parents "${output_dir_top}"
{
awk '$20 == "Y" && $21 != "NA" {print $2}' "${shortstack_dir}/${shortstack_results_file}" \
| grep --fixed-strings --file - "${shortstack_dir}/${shortstack_fixed_fasta_index}" \
| awk '{print $1}'
} \
> "${output_dir_top}/${regions}"
head "${output_dir_top}/${regions}"
Cluster_316::NC_058066.1:12757125-12757218(-)
Cluster_316.mature::NC_058066.1:12757147-12757168(-)
Cluster_316.star::NC_058066.1:12757177-12757198(-)
Cluster_548::NC_058066.1:20346227-20346321(-)
Cluster_548.mature::NC_058066.1:20346249-20346271(-)
Cluster_548.star::NC_058066.1:20346279-20346301(-)
Cluster_1506::NC_058067.1:5656192-5656286(-)
Cluster_1506.mature::NC_058067.1:5656214-5656236(-)
Cluster_1506.star::NC_058067.1:5656244-5656266(-)
Cluster_1900::NC_058067.1:16118215-16118311(-)
# Load bash variables into memory
source .bashvars
${samtools} faidx "${shortstack_dir}/${shortstack_fixed_fasta}" \
--region-file "${output_dir_top}/${regions}" \
> "${output_dir_top}/${output_fasta}"
head "${output_dir_top}/${output_fasta}"
echo ""
echo ""
echo ""
echo "Number of FastA sequences:"
grep "^>" --count "${output_dir_top}/${output_fasta}"
>Cluster_316::NC_058066.1:12757125-12757218(-)
ATGCTTTACTCCTTTGGGAGGGAGGTTAGTGCAGAGGTCATCGTTATTGATGATCTCTGC
AATAGCCTGCCTCCCAAAGGAGTTCTACTAGTCC
>Cluster_316.mature::NC_058066.1:12757147-12757168(-)
TGATCTCTGCAATAGCCTGCCT
>Cluster_316.star::NC_058066.1:12757177-12757198(-)
GGAGGTTAGTGCAGAGGTCATC
>Cluster_548::NC_058066.1:20346227-20346321(-)
TCATAAGGAAGGTACGGTTTCTTCGTTTATTCACTCGTTCATATTTATTATTAACGAGTA
GATAAATGAAGAGATCTTATCTTTGTTGGAAAAGA
Number of FastA sequences:
72