Extract mature miRNAs identified with matches to miRBase by ShortStack in 13.2.1-Pmea-sRNAseq-ShortStack-31bp-fastp-merged-cnidarian_miRBase.Rmd to FastA.
This allows usage of Bash variables across R Markdown chunks.
{
echo "#### Assign Variables ####"
echo ""
echo "# Trimmed FastQ naming pattern"
echo "# Data directories"
echo 'export deep_dive_dir=/home/shared/8TB_HDD_01/sam/gitrepos/deep-dive'
echo 'export shortstack_dir="${deep_dive_dir}/F-Pmea/output/13.2.1-Pmea-sRNAseq-ShortStack-31bp-fastp-merged-cnidarian_miRBase/ShortStack_out"'
echo 'export output_dir_top=${deep_dive_dir}/F-Pmea/output/13.2.1.1-Pmea-sRNAseq-ShortStack-FastA-extraction'
echo ""
echo "# Input/Output files"
echo 'export output_fasta="mature-miRBase-matches.fasta"'
echo 'export shortstack_fasta="mir.fasta"'
echo 'export shortstack_fasta_index="mir.fasta.fai"'
echo 'export shortstack_fixed_fasta="mir-coords-fixed.fasta"'
echo 'export shortstack_fixed_fasta_index="mir-coords-fixed.fasta.fai"'
echo 'export shortstack_results_file="Results.txt"'
echo 'export regions="mature-miRBase-regions.txt"'
echo ""
echo "# Set number of CPUs to use"
echo 'export threads=40'
echo ""
echo "# Programs"
echo 'export samtools=/home/shared/samtools-1.12/samtools'
} > .bashvars
cat .bashvars
#### Assign Variables ####
# Trimmed FastQ naming pattern
# Data directories
export deep_dive_dir=/home/shared/8TB_HDD_01/sam/gitrepos/deep-dive
export shortstack_dir="${deep_dive_dir}/F-Pmea/output/13.2.1-Pmea-sRNAseq-ShortStack-31bp-fastp-merged-cnidarian_miRBase/ShortStack_out"
export output_dir_top=${deep_dive_dir}/F-Pmea/output/13.2.1.1-Pmea-sRNAseq-ShortStack-FastA-extraction
# Input/Output files
export output_fasta="mature-miRBase-matches.fasta"
export shortstack_fasta="mir.fasta"
export shortstack_fasta_index="mir.fasta.fai"
export shortstack_fixed_fasta="mir-coords-fixed.fasta"
export shortstack_fixed_fasta_index="mir-coords-fixed.fasta.fai"
export shortstack_results_file="Results.txt"
export regions="mature-miRBase-regions.txt"
# Set number of CPUs to use
export threads=40
# Programs
export samtools=/home/shared/samtools-1.12/samtools
Results.txt
# Load bash variables into memory
source .bashvars
head "${shortstack_dir}/${shortstack_results_file}" | column -t
Locus Name Chrom Start End Length Reads DistinctSequences FracTop Strand MajorRNA MajorRNAReads Short Long 21 22 23 24 DicerCall MIRNA known_miRNAs
Pocillopora_meandrina_HIv1___Sc0000000:9092-9521 Cluster_1 Pocillopora_meandrina_HIv1___Sc0000000 9092 9521 430 10813 348 0.9999075187274576 + GGGGGUAUAGCUCAGUGGUAGA 3850 1422 3739 637 4394 174 447 N N NA
Pocillopora_meandrina_HIv1___Sc0000000:53578-53997 Cluster_2 Pocillopora_meandrina_HIv1___Sc0000000 53578 53997 420 287 13 0.9965156794425087 + GCCUAAGUUGCUUGGAACA 138 285 2 0 0 0 0 N N NA
Pocillopora_meandrina_HIv1___Sc0000000:150243-150718 Cluster_3 Pocillopora_meandrina_HIv1___Sc0000000 150243 150718 476 2549 247 0.0 - UGGCUAUGAUGAAAAUGACU 335 849 380 634 376 139 171 N N NA
Pocillopora_meandrina_HIv1___Sc0000000:173728-174150 Cluster_4 Pocillopora_meandrina_HIv1___Sc0000000 173728 174150 423 1257 65 0.9968178202068417 + UUUGAUUGCUGUGAUCUGGUUG 432 106 2 39 636 444 30 22 N apa-mir-2050_Exaiptasia_pallida_Baumgarten_et_al._2017_miR-2050;_Nve;_Spis;_Adi
Pocillopora_meandrina_HIv1___Sc0000000:187562-188076 Cluster_5 Pocillopora_meandrina_HIv1___Sc0000000 187562 188076 515 185 37 0.43243243243243246 . AUAAAUGUCACUACAAGAAACCUGAAAUCGU 25 2 175 1 1 2 4 N N NA
Pocillopora_meandrina_HIv1___Sc0000000:485730-486254 Cluster_6 Pocillopora_meandrina_HIv1___Sc0000000 485730 486254 525 286 127 1.0 + GAUGGGUGUUAUUACUCCUCAGACAGAC 48 66 183 7 11 3 16 N N NA
Pocillopora_meandrina_HIv1___Sc0000000:496020-496432 Cluster_7 Pocillopora_meandrina_HIv1___Sc0000000 496020 496432 413 72 24 1.0 + AUGUAGUCGAGCAAAGUCCAUGUGGACGA 27 0 66 2 1 1 2 N N NA
Pocillopora_meandrina_HIv1___Sc0000000:525310-527341 Cluster_8 Pocillopora_meandrina_HIv1___Sc0000000 525310 527341 2032 14765 2997 0.1810362343379614 - UUUUCGUCACUUUCUUCAGCCUCAGAGU 966 140 13674 47 106 311 487 N N NA
Pocillopora_meandrina_HIv1___Sc0000000:541262-541723 Cluster_9 Pocillopora_meandrina_HIv1___Sc0000000 541262 541723 462 732 134 0.07923497267759563 - UUGGACGAAAUUUCGAGGUUCACACUCGUU 91 1 725 1 4 1 0 N N NA
Column 1: Region of miRNA match
Column 20: ShortStack miRNA? Y/N
Column 21: Match to miRBase? NA or miRBase match
# Load bash variables into memory
source .bashvars
awk '{print $1"\t"$20"\t"$21}' "${shortstack_dir}/${shortstack_results_file}" | head | column -t
Locus MIRNA known_miRNAs
Pocillopora_meandrina_HIv1___Sc0000000:9092-9521 N NA
Pocillopora_meandrina_HIv1___Sc0000000:53578-53997 N NA
Pocillopora_meandrina_HIv1___Sc0000000:150243-150718 N NA
Pocillopora_meandrina_HIv1___Sc0000000:173728-174150 N apa-mir-2050_Exaiptasia_pallida_Baumgarten_et_al._2017_miR-2050;_Nve;_Spis;_Adi
Pocillopora_meandrina_HIv1___Sc0000000:187562-188076 N NA
Pocillopora_meandrina_HIv1___Sc0000000:485730-486254 N NA
Pocillopora_meandrina_HIv1___Sc0000000:496020-496432 N NA
Pocillopora_meandrina_HIv1___Sc0000000:525310-527341 N NA
Pocillopora_meandrina_HIv1___Sc0000000:541262-541723 N NA
# Load bash variables into memory
source .bashvars
awk '$20 == "Y" && $21 != "NA" {print $1"\t"$20"\t"$21}' "${shortstack_dir}/${shortstack_results_file}" | head | column -t
echo ""
echo "------------------------------------------"
echo ""
echo "Number of miRNAs matching miRBase:"
awk '$20 == "Y" && $21 != "NA" {print $1"\t"$20"\t"$21}' "${shortstack_dir}/${shortstack_results_file}" | wc -l
Pocillopora_meandrina_HIv1___Sc0000000:20372416-20372510 Y spi-mir-temp-15_Stylophora_pistillata_Liew_et_al._2014_NA;spi-miR-L-temp-15_Stylophora_pistillata_Praher_et_al._2021_NA
Pocillopora_meandrina_HIv1___Sc0000003:495066-495158 Y Adi-Mir-2036_3p_Acropora_digitifera_Gajigan_&_Conaco_2017_nve-miR-2036-3p;_nve-miR-2036-3p;_spi-miR-temp-30;eca-nve-F-miR-2036_Edwardsiella_carnea_Praher_et_al._2021_Transcriptome-level;spi-mir-temp-30_Stylophora_pistillata_Liew_et_al._2014_Close_match_of_nve-miR-2036.;ami-nve-F-miR-2036-3p_Acropora_millepora_Praher_et_al._2021_NA;spi-nve-F-miR-2036_Stylophora_pistillata_Praher_et_al._2021_NA
Pocillopora_meandrina_HIv1___Sc0000003:10129512-10129606 Y spi-mir-temp-23_Stylophora_pistillata_Liew_et_al._2014_NA
Pocillopora_meandrina_HIv1___Sc0000003:10366033-10366130 Y hsa-miR-100-5p;mmu-miR-100-5p;rno-miR-100-5p;gga-miR-100-5p;aga-miR-100;dre-miR-100-5p;ggo-miR-100;age-miR-100;ppa-miR-100;ppy-miR-100;ptr-miR-100;mml-miR-100-5p;sla-miR-100;lla-miR-100;fru-miR-100;tni-miR-100;xtr-miR-100;mdo-miR-100-5p;ame-miR-100-5p;oan-miR-100-5p;tca-miR-100-5p;bta-miR-100;lgi-miR-100;sko-miR-100;bmo-miR-100;eca-miR-100;ssc-miR-100;bma-miR-100b;aae-miR-100;cqu-miR-100-5p;tgu-miR-100-5p;nvi-miR-100;pma-miR-100a-5p;aca-miR-100;ola-miR-100;sha-miR-100;cgr-miR-100-5p;mse-miR-100;ccr-miR-100;ipu-miR-100;pmi-miR-100-5p;bbe-miR-100-5p;ssa-miR-100a-5p;cpi-miR-100-5p;ami-miR-100-5p;cli-miR-100-5p;pbv-miR-100-5p;chi-miR-100-5p;tch-miR-100-5p;pal-miR-100-5p;tcf-miR-100;abu-miR-100;mze-miR-100;nbr-miR-100;oni-miR-100;pny-miR-100;gmo-miR-100a-5p;pte-miR-100b-5p;xla-miR-100-5p;cpo-miR-100-5p;dno-miR-100-5p;ocu-miR-100-5p;mmr-miR-100;dma-miR-100;cja-miR-100;sbo-miR-100;pha-miR-100;nle-miR-100;oga-miR-100;dqu-miR-100-5p;pca-miR-100-5p;eca-nve-F-miR-100_Edwardsiella_carnea_Praher_et_al._2021_Transcriptome-level;mse-nve-F-miR-100_Metridium_senile_Praher_et_al._2021_Transcriptome-level;sca-nve-F-miR-100_Scolanthus_callimorphus_Praher_et_al._2021_Transcriptome-level;epa-nve-F-miR-100_Exaiptasia_pallida_Praher_et_al._2021_NA;spi-mir-temp-1_Stylophora_pistillata_Liew_et_al._2014_Matches_miR-100_family.;spi-nve-F-miR-100_Stylophora_pistillata_Praher_et_al._2021_NA;apa-mir-100_Exaiptasia_pallida_Baumgarten_et_al._2017_miR-100;_Nve;_Spis;_Adi;avi-miR-temp-100_Anemonia_viridis_Urbarova_et_al._2018_NA;miR-100_Nematostella_vectensis_Moran_et_al._2014_NA
Pocillopora_meandrina_HIv1___Sc0000005:601572-601666 Y nve-miR-2023-3p;Adi-Mir-2023_3p_Acropora_digitifera_Gajigan_&_Conaco_2017_nve-miR-2023-3p;_nve-miR-2023-3p;_spi-miR-temp-4;eca-nve-F-miR-2023_Edwardsiella_carnea_Praher_et_al._2021_Transcriptome-level;mse-nve-F-miR-2023_Metridium_senile_Praher_et_al._2021_Transcriptome-level;sca-nve-miR-2023-3p_Scolanthus_callimorphus_Praher_et_al._2021_Transcriptome-level;epa-nve-F-miR-2023_Exaiptasia_pallida_Praher_et_al._2021_NA;spi-mir-temp-4_Stylophora_pistillata_Liew_et_al._2014_Exact_match_of_nve-miR-2023.;ami-nve-F-miR-2023-3p_Acropora_millepora_Praher_et_al._2021_NA;spi-nve-F-miR-2023_Stylophora_pistillata_Praher_et_al._2021_NA;miR-2023_Nematostella_vectensis_Moran_et_al._2014_NA;apa-mir-2023_Exaiptasia_pallida_Baumgarten_et_al._2017_miR-2023;_Nve;_Spis;_Adi;avi-miR-temp-2023_Anemonia_viridis_Urbarova_et_al._2018_NA
Pocillopora_meandrina_HIv1___Sc0000005:10385497-10385597 Y spi-mir-temp-3_Stylophora_pistillata_Liew_et_al._2014_NA
Pocillopora_meandrina_HIv1___Sc0000009:3894900-3894992 Y Adi-Mir-2030_5p_Acropora_digitifera_Gajigan_&_Conaco_2017_nve-miR-2030-5p;_nve-miR-2030-5p;_spi-miR-temp-40;adi-nve-F-miR-2030_Acropora_digitifera__Praher_et_al._2021_NA;ami-nve-F-miR-2030-5p_Acropora_millepora_Praher_et_al._2021_NA;eca-nve-F-miR-2030_Edwardsiella_carnea_Praher_et_al._2021_Transcriptome-level;spi-mir-temp-40_Stylophora_pistillata_Liew_et_al._2014_Close_match_of_nve-miR-2030;miR-2030_Nematostella_vectensis_Moran_et_al._2014_NA
Pocillopora_meandrina_HIv1___Sc0000014:2366038-2366132 Y ami-nve-F-miR-2025-3p_Acropora_millepora_Praher_et_al._2021_NA;Adi-Mir-2025_3p_Acropora_digitifera_Gajigan_&_Conaco_2017_nve-miR-2025-3p;_nve-miR-2025-3p;adi-nve-F-miR-2025_Acropora_digitifera__Praher_et_al._2021_NA
Pocillopora_meandrina_HIv1___Sc0000021:4351817-4351909 Y spi-mir-temp-34_Stylophora_pistillata_Liew_et_al._2014_NA
------------------------------------------
Number of miRNAs matching miRBase:
9
# Load bash variables into memory
source .bashvars
grep "^>" "${shortstack_dir}/${shortstack_fasta}" | head
>Cluster_19::Pocillopora_meandrina_HIv1___Sc0000000:818026-818120(+)
>Cluster_19.mature::Pocillopora_meandrina_HIv1___Sc0000000:818048-818070(+)
>Cluster_19.star::Pocillopora_meandrina_HIv1___Sc0000000:818078-818100(+)
>Cluster_34::Pocillopora_meandrina_HIv1___Sc0000000:2872018-2872110(+)
>Cluster_34.mature::Pocillopora_meandrina_HIv1___Sc0000000:2872040-2872061(+)
>Cluster_34.star::Pocillopora_meandrina_HIv1___Sc0000000:2872069-2872090(+)
>Cluster_356::Pocillopora_meandrina_HIv1___Sc0000000:20372415-20372510(-)
>Cluster_356.mature::Pocillopora_meandrina_HIv1___Sc0000000:20372468-20372490(-)
>Cluster_356.star::Pocillopora_meandrina_HIv1___Sc0000000:20372435-20372456(-)
>Cluster_751::Pocillopora_meandrina_HIv1___Sc0000001:19145765-19145858(+)
Needed, due to bug in code (GitHub Issue) which incorrectly calculates the starting coordinates in the FastA output. All other files where start/stop coordinates are conveyed are correct.
The incorrect starting coordinates cause an issue in downstream manipulation, because the FastA headers need to match the ShortStack results file.
# Load bash variables into memory
source .bashvars
awk '
/^>/ {
# Split the line into main parts based on "::" delimiter
split($0, main_parts, "::")
# Extract the coordinate part and strand information separately
coordinates_strand = main_parts[2]
split(coordinates_strand, coord_parts, "[:-]")
# Determine if the strand information is present and extract it
strand = ""
if (substr(coordinates_strand, length(coordinates_strand)) ~ /[\(\)\-\+]/) {
strand = substr(coordinates_strand, length(coordinates_strand) - 1)
coordinates_strand = substr(coordinates_strand, 1, length(coordinates_strand) - 2)
split(coordinates_strand, coord_parts, "[:-]")
}
# Increment the starting coordinate by 1
new_start = coord_parts[2] + 1
# Reconstruct the description line with the new starting coordinate
new_description = main_parts[1] "::" coord_parts[1] ":" new_start "-" coord_parts[3] strand
# Print the modified description line
print new_description
# Skip to the next line to process the sequence line
next
}
# For sequence lines, print them as-is
{
print
}
' "${shortstack_dir}/${shortstack_fasta}" \
> "${shortstack_dir}/${shortstack_fixed_fasta}"
diff "${shortstack_dir}/${shortstack_fasta}" \
"${shortstack_dir}/${shortstack_fixed_fasta}" \
| head
1c1
< >Cluster_19::Pocillopora_meandrina_HIv1___Sc0000000:818026-818120(+)
---
> >Cluster_19::Pocillopora_meandrina_HIv1___Sc0000000:818027-818120(+)
3c3
< >Cluster_19.mature::Pocillopora_meandrina_HIv1___Sc0000000:818048-818070(+)
---
> >Cluster_19.mature::Pocillopora_meandrina_HIv1___Sc0000000:818049-818070(+)
5c5
< >Cluster_19.star::Pocillopora_meandrina_HIv1___Sc0000000:818078-818100(+)
samtools
# Load bash variables into memory
source .bashvars
${samtools} faidx "${shortstack_dir}/${shortstack_fixed_fasta}"
head "${shortstack_dir}/${shortstack_fixed_fasta_index}"
Cluster_19::Pocillopora_meandrina_HIv1___Sc0000000:818027-818120(+) 94 69 94 95
Cluster_19.mature::Pocillopora_meandrina_HIv1___Sc0000000:818049-818070(+) 22 240 22 23
Cluster_19.star::Pocillopora_meandrina_HIv1___Sc0000000:818079-818100(+) 22 337 22 23
Cluster_34::Pocillopora_meandrina_HIv1___Sc0000000:2872019-2872110(+) 92 431 92 93
Cluster_34.mature::Pocillopora_meandrina_HIv1___Sc0000000:2872041-2872061(+) 21 602 21 22
Cluster_34.star::Pocillopora_meandrina_HIv1___Sc0000000:2872070-2872090(+) 21 700 21 22
Cluster_356::Pocillopora_meandrina_HIv1___Sc0000000:20372416-20372510(-) 95 796 95 96
Cluster_356.mature::Pocillopora_meandrina_HIv1___Sc0000000:20372469-20372490(-) 22 973 22 23
Cluster_356.star::Pocillopora_meandrina_HIv1___Sc0000000:20372436-20372456(-) 21 1075 21 22
Cluster_751::Pocillopora_meandrina_HIv1___Sc0000001:19145766-19145858(+) 93 1171 93 94
# Load bash variables into memory
source .bashvars
# Make output directory, if it doesn't exist
mkdir --parents "${output_dir_top}"
{
awk '$20 == "Y" && $21 != "NA" {print $2}' "${shortstack_dir}/${shortstack_results_file}" \
| grep --fixed-strings --file - "${shortstack_dir}/${shortstack_fixed_fasta_index}" \
| awk '{print $1}'
} \
> "${output_dir_top}/${regions}"
head "${output_dir_top}/${regions}"
Cluster_356::Pocillopora_meandrina_HIv1___Sc0000000:20372416-20372510(-)
Cluster_356.mature::Pocillopora_meandrina_HIv1___Sc0000000:20372469-20372490(-)
Cluster_356.star::Pocillopora_meandrina_HIv1___Sc0000000:20372436-20372456(-)
Cluster_1108::Pocillopora_meandrina_HIv1___Sc0000003:495066-495158(+)
Cluster_1108.mature::Pocillopora_meandrina_HIv1___Sc0000003:495117-495138(+)
Cluster_1108.star::Pocillopora_meandrina_HIv1___Sc0000003:495086-495108(+)
Cluster_1274::Pocillopora_meandrina_HIv1___Sc0000003:10129512-10129606(-)
Cluster_1274.mature::Pocillopora_meandrina_HIv1___Sc0000003:10129534-10129556(-)
Cluster_1274.star::Pocillopora_meandrina_HIv1___Sc0000003:10129564-10129586(-)
Cluster_1279::Pocillopora_meandrina_HIv1___Sc0000003:10366033-10366130(+)
# Load bash variables into memory
source .bashvars
${samtools} faidx "${shortstack_dir}/${shortstack_fixed_fasta}" \
--region-file "${output_dir_top}/${regions}" \
> "${output_dir_top}/${output_fasta}"
head "${output_dir_top}/${output_fasta}"
echo ""
echo ""
echo ""
echo "Number of FastA sequences:"
grep "^>" --count "${output_dir_top}/${output_fasta}"
>Cluster_356::Pocillopora_meandrina_HIv1___Sc0000000:20372416-20372510(-)
ATATGCAGTGAATTAAGACACTAAACCAGACTAGGCTTCAGCATATTTATTTTGTCAAGT
CTAGGCTGGTTAGTTTTCTCACTTCACACTTCAAT
>Cluster_356.mature::Pocillopora_meandrina_HIv1___Sc0000000:20372469-20372490(-)
CTAAACCAGACTAGGCTTCAGC
>Cluster_356.star::Pocillopora_meandrina_HIv1___Sc0000000:20372436-20372456(-)
TCAAGTCTAGGCTGGTTAGTT
>Cluster_1108::Pocillopora_meandrina_HIv1___Sc0000003:495066-495158(+)
AAACTGCTGTCTGTGGACATGGTGAAAGTCGCTTCAATAAACATTTGACTGTATATTGTA
CGACTCTCATCGTGTCCAAGGCGGCCTCGACCG
Number of FastA sequences:
27