Extract mature miRNAs identified with matches to miRBase by ShortStack in 08.2-Peve-sRNAseq-ShortStack-31bp-fastp-merged.Rmd to FastA.
This allows usage of Bash variables across R Markdown chunks.
{
echo "#### Assign Variables ####"
echo ""
echo "# Trimmed FastQ naming pattern"
echo "# Data directories"
echo 'export deep_dive_dir=/home/shared/8TB_HDD_01/sam/gitrepos/deep-dive'
echo 'export shortstack_dir="${deep_dive_dir}/E-Peve/output/08.2-Peve-sRNAseq-ShortStack-31bp-fastp-merged/ShortStack_out"'
echo 'export output_dir_top=${deep_dive_dir}/E-Peve/output/08.2.1-Peve-sRNAseq-ShortStack-FastA-extraction'
echo ""
echo "# Input/Output files"
echo 'export output_fasta="mature-miRBase-matches.fasta"'
echo 'export shortstack_fasta="mir.fasta"'
echo 'export shortstack_fasta_index="mir.fasta.fai"'
echo 'export shortstack_fixed_fasta="mir-coords-fixed.fasta"'
echo 'export shortstack_fixed_fasta_index="mir-coords-fixed.fasta.fai"'
echo 'export shortstack_results_file="Results.txt"'
echo 'export regions="mature-miRBase-regions.txt"'
echo ""
echo "# Set number of CPUs to use"
echo 'export threads=40'
echo ""
echo "# Programs"
echo 'export samtools=/home/shared/samtools-1.12/samtools'
} > .bashvars
cat .bashvars
#### Assign Variables ####
# Trimmed FastQ naming pattern
# Data directories
export deep_dive_dir=/home/shared/8TB_HDD_01/sam/gitrepos/deep-dive
export shortstack_dir="${deep_dive_dir}/E-Peve/output/08.2-Peve-sRNAseq-ShortStack-31bp-fastp-merged/ShortStack_out"
export output_dir_top=${deep_dive_dir}/E-Peve/output/08.2.1-Peve-sRNAseq-ShortStack-FastA-extraction
# Input/Output files
export output_fasta="mature-miRBase-matches.fasta"
export shortstack_fasta="mir.fasta"
export shortstack_fasta_index="mir.fasta.fai"
export shortstack_fixed_fasta="mir-coords-fixed.fasta"
export shortstack_fixed_fasta_index="mir-coords-fixed.fasta.fai"
export shortstack_results_file="Results.txt"
export regions="mature-miRBase-regions.txt"
# Set number of CPUs to use
export threads=40
# Programs
export samtools=/home/shared/samtools-1.12/samtools
Results.txt
# Load bash variables into memory
source .bashvars
head "${shortstack_dir}/${shortstack_results_file}" | column -t
Locus Name Chrom Start End Length Reads DistinctSequences FracTop Strand MajorRNA MajorRNAReads Short Long 21 22 23 24 DicerCall MIRNA known_miRNAs
Porites_evermani_scaffold_1:45711-46131 Cluster_1 Porites_evermani_scaffold_1 45711 46131 421 88 38 1.0 + CAGUAGAGGUGGCCAAGAAUCAGU 8 24 27 9 8 9 11 N N NA
Porites_evermani_scaffold_1:201507-201931 Cluster_2 Porites_evermani_scaffold_1 201507 201931 425 58 14 0.034482758620689655 - UGUACUUCUGAUUAAACGAACCAGACAUCGC 12 0 50 0 0 0 8 N N NA
Porites_evermani_scaffold_1:313446-313846 Cluster_3 Porites_evermani_scaffold_1 313446 313846 401 50 27 0.0 - CUGACGUUUUAAGCUCAAUAGU 13 10 15 1 17 3 4 N N NA
Porites_evermani_scaffold_1:406146-406734 Cluster_4 Porites_evermani_scaffold_1 406146 406734 589 175 61 0.06285714285714286 - UGAGUGUAUUCUUGAACUGUUUUCCAAC 39 1 159 2 0 5 8 N N NA
Porites_evermani_scaffold_1:409839-410269 Cluster_5 Porites_evermani_scaffold_1 409839 410269 431 169 43 0.005917159763313609 - UGGAACUCCGAUUUAGAACUUGCAAACUUU 61 0 161 1 3 0 4 N N NA
Porites_evermani_scaffold_1:465244-465668 Cluster_6 Porites_evermani_scaffold_1 465244 465668 425 169 49 0.0 - AAGUUGCUCUGAAGAUUAUGU 39 34 52 48 8 20 7 N N NA
Porites_evermani_scaffold_1:468473-468950 Cluster_7 Porites_evermani_scaffold_1 468473 468950 478 91900 807 0.0 - AGCACUGAUGACUGUUCAGUUUUUCUGAAUU 68534 2227 88188 115 138 153 1079 N N NA
Porites_evermani_scaffold_1:476827-477250 Cluster_8 Porites_evermani_scaffold_1 476827 477250 424 116 37 0.0 - CGUGUCUUCGUAAUCGUCUCGUAC 14 33 38 0 12 15 18 N N NA
Porites_evermani_scaffold_1:486441-486868 Cluster_9 Porites_evermani_scaffold_1 486441 486868 428 57 11 0.07017543859649122 - AUAUUGACGAAUCCUGGCCUAGUGAACC 26 0 53 0 0 4 0 N N NA
Column 1: Region of miRNA match
Column 20: ShortStack miRNA? Y/N
Column 21: Match to miRBase? NA or miRBase match
# Load bash variables into memory
source .bashvars
awk '{print $1"\t"$20"\t"$21}' "${shortstack_dir}/${shortstack_results_file}" | head | column -t
Locus MIRNA known_miRNAs
Porites_evermani_scaffold_1:45711-46131 N NA
Porites_evermani_scaffold_1:201507-201931 N NA
Porites_evermani_scaffold_1:313446-313846 N NA
Porites_evermani_scaffold_1:406146-406734 N NA
Porites_evermani_scaffold_1:409839-410269 N NA
Porites_evermani_scaffold_1:465244-465668 N NA
Porites_evermani_scaffold_1:468473-468950 N NA
Porites_evermani_scaffold_1:476827-477250 N NA
Porites_evermani_scaffold_1:486441-486868 N NA
# Load bash variables into memory
source .bashvars
awk '$20 == "Y" && $21 != "NA" {print $1"\t"$20"\t"$21}' "${shortstack_dir}/${shortstack_results_file}" | head | column -t
echo ""
echo "------------------------------------------"
echo ""
echo "Number of miRNAs matching miRBase:"
awk '$20 == "Y" && $21 != "NA" {print $1"\t"$20"\t"$21}' "${shortstack_dir}/${shortstack_results_file}" | wc -l
Porites_evermani_scaffold_49:151587-151681 Y ami-nve-F-miR-100-5p_Acropora_millepora_Praher_et_al._2021_NA;Adi-Mir-100_5p_Acropora_digitifera_Gajigan_&_Conaco_2017_mir-100;_spi-miR-temp-1;adi-nve-F-miR-100_Acropora_digitifera__Praher_et_al._2021_NA
Porites_evermani_scaffold_430:205865-205960 Y spi-mir-temp-30_Stylophora_pistillata_Liew_et_al._2014_Close_match_of_nve-miR-2036.;spi-nve-F-miR-2036_Stylophora_pistillata_Praher_et_al._2021_NA;ami-nve-F-miR-2036-3p_Acropora_millepora_Praher_et_al._2021_NA;Adi-Mir-2036_3p_Acropora_digitifera_Gajigan_&_Conaco_2017_nve-miR-2036-3p;_nve-miR-2036-3p;_spi-miR-temp-30;eca-nve-F-miR-2036_Edwardsiella_carnea_Praher_et_al._2021_Transcriptome-level
Porites_evermani_scaffold_502:58948-59038 Y ami-nve-F-miR-2030-5p_Acropora_millepora_Praher_et_al._2021_NA;Adi-Mir-2030_5p_Acropora_digitifera_Gajigan_&_Conaco_2017_nve-miR-2030-5p;_nve-miR-2030-5p;_spi-miR-temp-40;adi-nve-F-miR-2030_Acropora_digitifera__Praher_et_al._2021_NA;spi-mir-temp-40_Stylophora_pistillata_Liew_et_al._2014_Close_match_of_nve-miR-2030;eca-nve-F-miR-2030_Edwardsiella_carnea_Praher_et_al._2021_Transcriptome-level;miR-2030_Nematostella_vectensis_Moran_et_al._2014_NA
Porites_evermani_scaffold_594:158176-158270 Y nve-miR-2023-3p;spi-mir-temp-4_Stylophora_pistillata_Liew_et_al._2014_Exact_match_of_nve-miR-2023.;apa-mir-2023_Exaiptasia_pallida_Baumgarten_et_al._2017_miR-2023;_Nve;_Spis;_Adi;mse-nve-F-miR-2023_Metridium_senile_Praher_et_al._2021_Transcriptome-level;sca-nve-miR-2023-3p_Scolanthus_callimorphus_Praher_et_al._2021_Transcriptome-level;eca-nve-F-miR-2023_Edwardsiella_carnea_Praher_et_al._2021_Transcriptome-level;ami-nve-F-miR-2023-3p_Acropora_millepora_Praher_et_al._2021_NA;Adi-Mir-2023_3p_Acropora_digitifera_Gajigan_&_Conaco_2017_nve-miR-2023-3p;_nve-miR-2023-3p;_spi-miR-temp-4;epa-nve-F-miR-2023_Exaiptasia_pallida_Praher_et_al._2021_NA;miR-2023_Nematostella_vectensis_Moran_et_al._2014_NA;spi-nve-F-miR-2023_Stylophora_pistillata_Praher_et_al._2021_NA;avi-miR-temp-2023_Anemonia_viridis_Urbarova_et_al._2018_NA
Porites_evermani_scaffold_910:99233-99322 Y adi-miR-P-novel-6-3p_Acropora_digitifera__Praher_et_al._2021_NA
Porites_evermani_scaffold_910:118720-118809 Y adi-miR-P-novel-6-3p_Acropora_digitifera__Praher_et_al._2021_NA;spi-mir-temp-14_Stylophora_pistillata_Liew_et_al._2014_NA;ami-spi-miR-L-temp-14-5p_Acropora_millepora_Praher_et_al._2021_NA
Porites_evermani_scaffold_910:139331-139420 Y ami-spi-miR-L-temp-14-5p_Acropora_millepora_Praher_et_al._2021_NA
Porites_evermani_scaffold_1503:47558-47651 Y spi-mir-temp-15_Stylophora_pistillata_Liew_et_al._2014_NA;spi-miR-L-temp-15_Stylophora_pistillata_Praher_et_al._2021_NA
Porites_evermani_scaffold_3072:29428-29521 Y ami-nve-F-miR-2025-3p_Acropora_millepora_Praher_et_al._2021_NA;Adi-Mir-2025_3p_Acropora_digitifera_Gajigan_&_Conaco_2017_nve-miR-2025-3p;_nve-miR-2025-3p;adi-nve-F-miR-2025_Acropora_digitifera__Praher_et_al._2021_NA
------------------------------------------
Number of miRNAs matching miRBase:
9
# Load bash variables into memory
source .bashvars
grep "^>" "${shortstack_dir}/${shortstack_fasta}" | head
>Cluster_29::Porites_evermani_scaffold_1:1404249-1404342(-)
>Cluster_29.mature::Porites_evermani_scaffold_1:1404271-1404293(-)
>Cluster_29.star::Porites_evermani_scaffold_1:1404300-1404322(-)
>Cluster_578::Porites_evermani_scaffold_16:383385-383478(-)
>Cluster_578.mature::Porites_evermani_scaffold_16:383436-383458(-)
>Cluster_578.star::Porites_evermani_scaffold_16:383405-383427(-)
>Cluster_786::Porites_evermani_scaffold_26:382549-382645(-)
>Cluster_786.mature::Porites_evermani_scaffold_26:382571-382593(-)
>Cluster_786.star::Porites_evermani_scaffold_26:382602-382625(-)
>Cluster_1125::Porites_evermani_scaffold_47:475971-476066(-)
Needed, due to bug in code (GitHub Issue) which incorrectly calculates the starting coordinates in the FastA output. All other files where start/stop coordinates are conveyed are correct.
The incorrect starting coordinates cause an issue in downstream manipulation, because the FastA headers need to match the ShortStack results file.
# Load bash variables into memory
source .bashvars
awk '
/^>/ {
# Split the line into main parts based on "::" delimiter
split($0, main_parts, "::")
# Extract the coordinate part and strand information separately
coordinates_strand = main_parts[2]
split(coordinates_strand, coord_parts, "[:-]")
# Determine if the strand information is present and extract it
strand = ""
if (substr(coordinates_strand, length(coordinates_strand)) ~ /[\(\)\-\+]/) {
strand = substr(coordinates_strand, length(coordinates_strand) - 1)
coordinates_strand = substr(coordinates_strand, 1, length(coordinates_strand) - 2)
split(coordinates_strand, coord_parts, "[:-]")
}
# Increment the starting coordinate by 1
new_start = coord_parts[2] + 1
# Reconstruct the description line with the new starting coordinate
new_description = main_parts[1] "::" coord_parts[1] ":" new_start "-" coord_parts[3] strand
# Print the modified description line
print new_description
# Skip to the next line to process the sequence line
next
}
# For sequence lines, print them as-is
{
print
}
' "${shortstack_dir}/${shortstack_fasta}" \
> "${shortstack_dir}/${shortstack_fixed_fasta}"
diff "${shortstack_dir}/${shortstack_fasta}" \
"${shortstack_dir}/${shortstack_fixed_fasta}" \
| head
1c1
< >Cluster_29::Porites_evermani_scaffold_1:1404249-1404342(-)
---
> >Cluster_29::Porites_evermani_scaffold_1:1404250-1404342(-)
3c3
< >Cluster_29.mature::Porites_evermani_scaffold_1:1404271-1404293(-)
---
> >Cluster_29.mature::Porites_evermani_scaffold_1:1404272-1404293(-)
5c5
< >Cluster_29.star::Porites_evermani_scaffold_1:1404300-1404322(-)
samtools
# Load bash variables into memory
source .bashvars
${samtools} faidx "${shortstack_dir}/${shortstack_fixed_fasta}"
head "${shortstack_dir}/${shortstack_fixed_fasta_index}"
Cluster_29::Porites_evermani_scaffold_1:1404250-1404342(-) 93 60 93 94
Cluster_29.mature::Porites_evermani_scaffold_1:1404272-1404293(-) 22 221 22 23
Cluster_29.star::Porites_evermani_scaffold_1:1404301-1404322(-) 22 309 22 23
Cluster_578::Porites_evermani_scaffold_16:383386-383478(-) 93 392 93 94
Cluster_578.mature::Porites_evermani_scaffold_16:383437-383458(-) 22 553 22 23
Cluster_578.star::Porites_evermani_scaffold_16:383406-383427(-) 22 641 22 23
Cluster_786::Porites_evermani_scaffold_26:382550-382645(-) 96 724 96 97
Cluster_786.mature::Porites_evermani_scaffold_26:382572-382593(-) 22 888 22 23
Cluster_786.star::Porites_evermani_scaffold_26:382603-382625(-) 23 976 23 24
Cluster_1125::Porites_evermani_scaffold_47:475972-476066(-) 95 1061 95 96
# Load bash variables into memory
source .bashvars
# Make output directory, if it doesn't exist
mkdir --parents "${output_dir_top}"
{
awk '$20 == "Y" && $21 != "NA" {print $2}' "${shortstack_dir}/${shortstack_results_file}" \
| grep --fixed-strings --file - "${shortstack_dir}/${shortstack_fixed_fasta_index}" \
| awk '{print $1}'
} \
> "${output_dir_top}/${regions}"
head "${output_dir_top}/${regions}"
Cluster_1153::Porites_evermani_scaffold_49:151587-151681(-)
Cluster_1153.mature::Porites_evermani_scaffold_49:151640-151661(-)
Cluster_1153.star::Porites_evermani_scaffold_49:151607-151628(-)
Cluster_5540::Porites_evermani_scaffold_430:205865-205960(-)
Cluster_5540.mature::Porites_evermani_scaffold_430:205887-205909(-)
Cluster_5540.star::Porites_evermani_scaffold_430:205917-205940(-)
Cluster_6211::Porites_evermani_scaffold_502:58948-59038(-)
Cluster_6211.mature::Porites_evermani_scaffold_502:58997-59018(-)
Cluster_6211.star::Porites_evermani_scaffold_502:58968-58989(-)
Cluster_6875::Porites_evermani_scaffold_594:158176-158270(+)
# Load bash variables into memory
source .bashvars
${samtools} faidx "${shortstack_dir}/${shortstack_fixed_fasta}" \
--region-file "${output_dir_top}/${regions}" \
> "${output_dir_top}/${output_fasta}"
head "${output_dir_top}/${output_fasta}"
echo ""
echo ""
echo ""
echo "Number of FastA sequences:"
grep "^>" --count "${output_dir_top}/${output_fasta}"
>Cluster_1153::Porites_evermani_scaffold_49:151587-151681(-)
AATTCTAGCAGCTCATGCGATCCCGTAGATCCGAACTTGTGGGTATTTTCTCCACAGGTT
GGGCTCTACGGTCATATTTGCTGTAATATAACATG
>Cluster_1153.mature::Porites_evermani_scaffold_49:151640-151661(-)
TCCCGTAGATCCGAACTTGTGG
>Cluster_1153.star::Porites_evermani_scaffold_49:151607-151628(-)
ACAGGTTGGGCTCTACGGTCAT
>Cluster_5540::Porites_evermani_scaffold_430:205865-205960(-)
TGAAGTGTTGTTTCTAGAGGCGGTGAAAGTCGTCTCAATACACAGTTGATGTATATTGTA
CGACTCTCATCGTGTTTGTGACAATCTCTTATTTCA
Number of FastA sequences:
27