This notebook performs a simple NCBI BLASTn (Altschul et al. 1990) against an miRNA database to attempt to identify miRNA in P.meandrina sRNAseq:
Utilizes a modified version, which includes cnidarian miRNA culled from literature by Jill Ahsley.
Relies on the following software:
fastx_collapser
: Collapses duplicate sequences in FastA/Q into single sequence.This allows usage of Bash variables across R Markdown chunks.
{
echo "#### Assign Variables ####"
echo ""
echo "# Trimmed FastQ naming pattern"
echo "export trimmed_fastqs_pattern='*fastp-adapters-polyG-31bp-merged.fq.gz'"
echo "# Data directories"
echo 'export deep_dive_dir=/home/shared/8TB_HDD_01/sam/gitrepos/deep-dive'
echo 'export deep_dive_data_dir="${deep_dive_dir}/data"'
echo 'export output_dir_top=${deep_dive_dir}/F-Pmea/output/10.1-Pmea-sRNAseq-BLASTn-31bp-fastp-merged-cnidarian_miRBase'
echo 'export trimmed_fastqs_dir="${deep_dive_dir}/F-Pmea/output/08.2-Pmea-sRNAseq-trimming-31bp-fastp-merged/trimmed-reads"'
echo 'export blast_dbs_dir="${deep_dive_dir}/data/blast_dbs"'
echo ""
echo "# Input/Output files"
echo 'export collapsed_reads_fasta="collapsed-reads-all.fasta"'
echo 'export concatenated_trimmed_reads_fastq="concatenated-trimmed-reads-all.fastq.gz"'
echo 'export mirbase_mature_fasta_name="cnidarian-mirbase-mature-v22.1.fasta"'
echo 'export mirbase_mature_fasta_no_U="cnidarian-mirbase-mature-v22.1-no_U.fa"'
echo 'export mirgene_mature_fasta_name="mirgene-mature-all-v2.1.fa"'
echo 'export mirgene_mature_fasta_no_U="mirgene-mature-all-v2.1-no_U.fa"'
echo ""
echo "# External data URLs"
echo 'export mirgenedb_fasta_url="https://www.mirgenedb.org/fasta/ALL?mat=1"'
echo ""
echo "# Paths to programs"
echo 'export ncbi_blast_dir="/home/shared/ncbi-blast-2.15.0+/bin/"'
echo 'export ncbi_blastn="${ncbi_blast_dir}/blastn"'
echo 'export ncbi_makeblast_db="${ncbi_blast_dir}/makeblastdb"'
echo 'export fastx_collapser="/home/shared/fastx_toolkit_0.0.13_binaries_Linux_2.6_amd64/bin/fastx_collapser"'
echo "# Set number of CPUs to use"
echo 'export threads=46'
echo ""
echo "# Initialize arrays"
echo 'export trimmed_fastqs_array=()'
} > .bashvars
cat .bashvars
#### Assign Variables ####
# Trimmed FastQ naming pattern
export trimmed_fastqs_pattern='*fastp-adapters-polyG-31bp-merged.fq.gz'
# Data directories
export deep_dive_dir=/home/shared/8TB_HDD_01/sam/gitrepos/deep-dive
export deep_dive_data_dir="${deep_dive_dir}/data"
export output_dir_top=${deep_dive_dir}/F-Pmea/output/10.1-Pmea-sRNAseq-BLASTn-31bp-fastp-merged-cnidarian_miRBase
export trimmed_fastqs_dir="${deep_dive_dir}/F-Pmea/output/08.2-Pmea-sRNAseq-trimming-31bp-fastp-merged/trimmed-reads"
export blast_dbs_dir="${deep_dive_dir}/data/blast_dbs"
# Input/Output files
export collapsed_reads_fasta="collapsed-reads-all.fasta"
export concatenated_trimmed_reads_fastq="concatenated-trimmed-reads-all.fastq.gz"
export mirbase_mature_fasta_name="cnidarian-mirbase-mature-v22.1.fasta"
export mirbase_mature_fasta_no_U="cnidarian-mirbase-mature-v22.1-no_U.fa"
export mirgene_mature_fasta_name="mirgene-mature-all-v2.1.fa"
export mirgene_mature_fasta_no_U="mirgene-mature-all-v2.1-no_U.fa"
# External data URLs
export mirgenedb_fasta_url="https://www.mirgenedb.org/fasta/ALL?mat=1"
# Paths to programs
export ncbi_blast_dir="/home/shared/ncbi-blast-2.15.0+/bin/"
export ncbi_blastn="${ncbi_blast_dir}/blastn"
export ncbi_makeblast_db="${ncbi_blast_dir}/makeblastdb"
export fastx_collapser="/home/shared/fastx_toolkit_0.0.13_binaries_Linux_2.6_amd64/bin/fastx_collapser"
# Set number of CPUs to use
export threads=46
# Initialize arrays
export trimmed_fastqs_array=()
# Load bash variables into memory
source .bashvars
# Download MirGeneDB, if it doesn't exist
if [ ! -f "${deep_dive_data_dir}/${mirgene_mature_fasta_name}" ]; then
wget \
--no-check-certificate \
--continue \
--no-host-directories \
--no-directories \
--no-parent \
--quiet \
--execute robots=off \
--output-document ${deep_dive_data_dir}/${mirgene_mature_fasta_name} \
${mirgenedb_fasta_url}
fi
ls -lh ${deep_dive_data_dir}
total 24M
drwxr-xr-x 2 sam sam 4.0K Apr 22 11:41 blast_dbs
-rw-rw-r-- 1 sam sam 3.8M Apr 2 06:51 cnidarian-mirbase-mature-v22.1.fasta
-rw-r--r-- 1 sam sam 3.8M Apr 19 12:22 cnidarian-mirbase-mature-v22.1-no_spaces.fa
-rw-r--r-- 1 sam sam 3.8M Apr 22 12:56 cnidarian-mirbase-mature-v22.1-no_U.fa
-rw-r--r-- 1 sam sam 44K Apr 2 06:49 cnidarian_miRNAs.fasta
-rw-r--r-- 1 sam sam 3.7M Feb 16 12:36 mirbase-mature-v22.1.fa
-rw-r--r-- 1 sam sam 3.7M Dec 4 11:05 mirbase-mature-v22.1-no_spaces.fa
-rw-r--r-- 1 sam sam 3.7M Apr 22 10:12 mirbase-mature-v22.1-no_U.fa
-rw-r--r-- 1 sam sam 726K Nov 7 14:36 mirgene-mature-all-v2.1.fa
-rw-r--r-- 1 sam sam 726K Apr 22 12:56 mirgene-mature-all-v2.1-no_U.fa
# Load bash variables into memory
source .bashvars
head "${deep_dive_data_dir}"/[cm][ni]*.fa*
==> /home/shared/8TB_HDD_01/sam/gitrepos/deep-dive/data/cnidarian-mirbase-mature-v22.1.fasta <==
>cel-let-7-5p MIMAT0000001 Caenorhabditis elegans let-7-5p
UGAGGUAGUAGGUUGUAUAGUU
>cel-let-7-3p MIMAT0015091 Caenorhabditis elegans let-7-3p
CUAUGCAAUUUUCUACCUUACC
>cel-lin-4-5p MIMAT0000002 Caenorhabditis elegans lin-4-5p
UCCCUGAGACCUCAAGUGUGA
>cel-lin-4-3p MIMAT0015092 Caenorhabditis elegans lin-4-3p
ACACCUGGGCUCUCCGGGUACC
>cel-miR-1-5p MIMAT0020301 Caenorhabditis elegans miR-1-5p
CAUACUUCCUUACAUGCCCAUA
==> /home/shared/8TB_HDD_01/sam/gitrepos/deep-dive/data/cnidarian-mirbase-mature-v22.1-no_spaces.fa <==
>cel-let-7-5p_MIMAT0000001_Caenorhabditis_elegans_let-7-5p
UGAGGUAGUAGGUUGUAUAGUU
>cel-let-7-3p_MIMAT0015091_Caenorhabditis_elegans_let-7-3p
CUAUGCAAUUUUCUACCUUACC
>cel-lin-4-5p_MIMAT0000002_Caenorhabditis_elegans_lin-4-5p
UCCCUGAGACCUCAAGUGUGA
>cel-lin-4-3p_MIMAT0015092_Caenorhabditis_elegans_lin-4-3p
ACACCUGGGCUCUCCGGGUACC
>cel-miR-1-5p_MIMAT0020301_Caenorhabditis_elegans_miR-1-5p
CAUACUUCCUUACAUGCCCAUA
==> /home/shared/8TB_HDD_01/sam/gitrepos/deep-dive/data/cnidarian-mirbase-mature-v22.1-no_U.fa <==
>cel-let-7-5p MIMAT0000001 Caenorhabditis elegans let-7-5p
TGAGGTAGTAGGTTGTATAGTT
>cel-let-7-3p MIMAT0015091 Caenorhabditis elegans let-7-3p
CTATGCAATTTTCTACCTTACC
>cel-lin-4-5p MIMAT0000002 Caenorhabditis elegans lin-4-5p
TCCCTGAGACCTCAAGTGTGA
>cel-lin-4-3p MIMAT0015092 Caenorhabditis elegans lin-4-3p
ACACCTGGGCTCTCCGGGTACC
>cel-miR-1-5p MIMAT0020301 Caenorhabditis elegans miR-1-5p
CATACTTCCTTACATGCCCATA
==> /home/shared/8TB_HDD_01/sam/gitrepos/deep-dive/data/cnidarian_miRNAs.fasta <==
>spi-mir-temp-1_Stylophora_pistillata_Liew_et_al._2014_Matches_miR-100_family.
ACCCGUAGAUCCGAACUUGUGG
>spi-mir-temp-2_Stylophora_pistillata_Liew_et_al._2014_NA
UAUCGAAUCCGUCAAAAAGAGA
>spi-mir-temp-3_Stylophora_pistillata_Liew_et_al._2014_NA
UCAGGGAUUGUGGUGAGUUAGUU
>spi-mir-temp-4_Stylophora_pistillata_Liew_et_al._2014_Exact_match_of_nve-miR-2023.
AAAGAAGUACAAGUGGUAGGG
>spi-mir-temp-5_Stylophora_pistillata_Liew_et_al._2014_NA
GAGGUCCGGAUGGUUGA
==> /home/shared/8TB_HDD_01/sam/gitrepos/deep-dive/data/mirbase-mature-v22.1.fa <==
>cel-let-7-5p MIMAT0000001 Caenorhabditis elegans let-7-5p
UGAGGUAGUAGGUUGUAUAGUU
>cel-let-7-3p MIMAT0015091 Caenorhabditis elegans let-7-3p
CUAUGCAAUUUUCUACCUUACC
>cel-lin-4-5p MIMAT0000002 Caenorhabditis elegans lin-4-5p
UCCCUGAGACCUCAAGUGUGA
>cel-lin-4-3p MIMAT0015092 Caenorhabditis elegans lin-4-3p
ACACCUGGGCUCUCCGGGUACC
>cel-miR-1-5p MIMAT0020301 Caenorhabditis elegans miR-1-5p
CAUACUUCCUUACAUGCCCAUA
==> /home/shared/8TB_HDD_01/sam/gitrepos/deep-dive/data/mirbase-mature-v22.1-no_spaces.fa <==
>cel-let-7-5p_MIMAT0000001_Caenorhabditis_elegans_let-7-5p
UGAGGUAGUAGGUUGUAUAGUU
>cel-let-7-3p_MIMAT0015091_Caenorhabditis_elegans_let-7-3p
CUAUGCAAUUUUCUACCUUACC
>cel-lin-4-5p_MIMAT0000002_Caenorhabditis_elegans_lin-4-5p
UCCCUGAGACCUCAAGUGUGA
>cel-lin-4-3p_MIMAT0015092_Caenorhabditis_elegans_lin-4-3p
ACACCUGGGCUCUCCGGGUACC
>cel-miR-1-5p_MIMAT0020301_Caenorhabditis_elegans_miR-1-5p
CAUACUUCCUUACAUGCCCAUA
==> /home/shared/8TB_HDD_01/sam/gitrepos/deep-dive/data/mirbase-mature-v22.1-no_U.fa <==
>cel-let-7-5p MIMAT0000001 Caenorhabditis elegans let-7-5p
TGAGGTAGTAGGTTGTATAGTT
>cel-let-7-3p MIMAT0015091 Caenorhabditis elegans let-7-3p
CTATGCAATTTTCTACCTTACC
>cel-lin-4-5p MIMAT0000002 Caenorhabditis elegans lin-4-5p
TCCCTGAGACCTCAAGTGTGA
>cel-lin-4-3p MIMAT0015092 Caenorhabditis elegans lin-4-3p
ACACCTGGGCTCTCCGGGTACC
>cel-miR-1-5p MIMAT0020301 Caenorhabditis elegans miR-1-5p
CATACTTCCTTACATGCCCATA
==> /home/shared/8TB_HDD_01/sam/gitrepos/deep-dive/data/mirgene-mature-all-v2.1.fa <==
>Aae-Bantam_3p
UGAGAUCAUUUUGAAAGCUGAUU
>Bge-Bantam_3p
UGAGAUCAUUGUGAAAGCUGAUU
>Bpl-Bantam_3p
UGAGAUCAUUGUGAAAACUGAU
>Cgi-Bantam_3p
UGAGAUCAUUGUGAAAACUGAUU
>Cte-Bantam_3p
UGAGAUCAUUGUGAAAACUAAUC
==> /home/shared/8TB_HDD_01/sam/gitrepos/deep-dive/data/mirgene-mature-all-v2.1-no_U.fa <==
>Aae-Bantam_3p
TGAGATCATTTTGAAAGCTGATT
>Bge-Bantam_3p
TGAGATCATTGTGAAAGCTGATT
>Bpl-Bantam_3p
TGAGATCATTGTGAAAACTGAT
>Cgi-Bantam_3p
TGAGATCATTGTGAAAACTGATT
>Cte-Bantam_3p
TGAGATCATTGTGAAAACTAATC
U
to T
in miRNA FastAsThis is needed because the sRNAseq sequences do not have uracils (U
) - they have thymines (T
).
# Load bash variables into memory
source .bashvars
# Convert miRBase FastA
sed '/^[^>]/s/U/T/g' "${deep_dive_data_dir}/${mirbase_mature_fasta_name}" \
> "${deep_dive_data_dir}/${mirbase_mature_fasta_no_U}"
# Convert MirGene FastA
sed '/^[^>]/s/U/T/g' "${deep_dive_data_dir}/${mirgene_mature_fasta_name}" \
> "${deep_dive_data_dir}/${mirgene_mature_fasta_no_U}"
head "${deep_dive_data_dir}"/[cm][ni]*U.fa
==> /home/shared/8TB_HDD_01/sam/gitrepos/deep-dive/data/cnidarian-mirbase-mature-v22.1-no_U.fa <==
>cel-let-7-5p MIMAT0000001 Caenorhabditis elegans let-7-5p
TGAGGTAGTAGGTTGTATAGTT
>cel-let-7-3p MIMAT0015091 Caenorhabditis elegans let-7-3p
CTATGCAATTTTCTACCTTACC
>cel-lin-4-5p MIMAT0000002 Caenorhabditis elegans lin-4-5p
TCCCTGAGACCTCAAGTGTGA
>cel-lin-4-3p MIMAT0015092 Caenorhabditis elegans lin-4-3p
ACACCTGGGCTCTCCGGGTACC
>cel-miR-1-5p MIMAT0020301 Caenorhabditis elegans miR-1-5p
CATACTTCCTTACATGCCCATA
==> /home/shared/8TB_HDD_01/sam/gitrepos/deep-dive/data/mirbase-mature-v22.1-no_U.fa <==
>cel-let-7-5p MIMAT0000001 Caenorhabditis elegans let-7-5p
TGAGGTAGTAGGTTGTATAGTT
>cel-let-7-3p MIMAT0015091 Caenorhabditis elegans let-7-3p
CTATGCAATTTTCTACCTTACC
>cel-lin-4-5p MIMAT0000002 Caenorhabditis elegans lin-4-5p
TCCCTGAGACCTCAAGTGTGA
>cel-lin-4-3p MIMAT0015092 Caenorhabditis elegans lin-4-3p
ACACCTGGGCTCTCCGGGTACC
>cel-miR-1-5p MIMAT0020301 Caenorhabditis elegans miR-1-5p
CATACTTCCTTACATGCCCATA
==> /home/shared/8TB_HDD_01/sam/gitrepos/deep-dive/data/mirgene-mature-all-v2.1-no_U.fa <==
>Aae-Bantam_3p
TGAGATCATTTTGAAAGCTGATT
>Bge-Bantam_3p
TGAGATCATTGTGAAAGCTGATT
>Bpl-Bantam_3p
TGAGATCATTGTGAAAACTGAT
>Cgi-Bantam_3p
TGAGATCATTGTGAAAACTGATT
>Cte-Bantam_3p
TGAGATCATTGTGAAAACTAATC
# Load bash variables into memory
source .bashvars
# MirGene BLAST DB
## Make sure output directory exists
if [ ! -d "${blast_dbs_dir}" ]; then
mkdir --parents "${blast_dbs_dir}"
fi
## Check for pre-exising database
if [ ! -f "${blast_dbs_dir}/${mirgene_mature_fasta_no_U%.*}.blastdb.log" ]; then
${ncbi_makeblast_db} \
-in ${deep_dive_data_dir}/${mirgene_mature_fasta_no_U} \
-title ${mirgene_mature_fasta_no_U%.*} \
-dbtype nucl \
-out ${blast_dbs_dir}/${mirgene_mature_fasta_no_U%.*} \
-logfile ${blast_dbs_dir}/${mirgene_mature_fasta_no_U%.*}.blastdb.log
fi
# miRBase BLAST DB
## Make sure output directory exists
if [ ! -d "${blast_dbs_dir}" ]; then
mkdir --parents "${blast_dbs_dir}"
fi
## Check for pre-exising database
if [ ! -f "${blast_dbs_dir}/${mirbase_mature_fasta_no_U%.*}.blastdb.log" ]; then
${ncbi_makeblast_db} \
-in ${deep_dive_data_dir}/${mirbase_mature_fasta_no_U} \
-title ${mirbase_mature_fasta_no_U%.*} \
-dbtype nucl \
-out ${blast_dbs_dir}/${mirbase_mature_fasta_no_U%.*} \
-logfile ${blast_dbs_dir}/${mirbase_mature_fasta_no_U%.*}.blastdb.log
fi
# Load bash variables into memory
source .bashvars
# Make output directory, if it doens't exist
if [ ! -d "${output_dir_top}" ]; then
mkdir --parents "${output_dir_top}"
fi
# Check for existence of concatenated FastA before running
if [ ! -f "${output_dir_top}/${concatenated_trimmed_reads_fastq}" ]; then
cat ${trimmed_fastqs_dir}/${trimmed_fastqs_pattern} \
> "${output_dir_top}/${concatenated_trimmed_reads_fastq}"
fi
ls -lh "${output_dir_top}/${concatenated_trimmed_reads_fastq}"
-rw-r--r-- 1 sam sam 1.1G Apr 22 11:03 /home/shared/8TB_HDD_01/sam/gitrepos/deep-dive/F-Pmea/output/10.1-Pmea-sRNAseq-BLASTn-31bp-fastp-merged-cnidarian_miRBase/concatenated-trimmed-reads-all.fastq.gz
Uses fastx_collapser
to collapse to unique reads.
Requires undocumented quality setting. Have selected 30
as cuttoff: -Q30
.
# Load bash variables into memory
source .bashvars
# Check for existence of collapsed FastA before running
time \
if [ ! -f "${output_dir_top}/${collapsed_reads_fasta}" ]; then
zcat ${output_dir_top}/${concatenated_trimmed_reads_fastq} \
| ${fastx_collapser} \
-Q30 \
-o "${output_dir_top}/${collapsed_reads_fasta}"
fi
head "${output_dir_top}/${collapsed_reads_fasta}"
echo ""
echo "-----------------------------------------------------"
echo ""
echo "Number of collapsed reads:"
grep --count "^>" "${output_dir_top}/${collapsed_reads_fasta}"
real 0m0.000s
user 0m0.000s
sys 0m0.000s
>1-10257663
TGAAAATCTTTGCTCTGAAGTGGAA
>2-2952210
CTCGGGCTGAGACTTGAAGCG
>3-1527059
GCACTGGTGGTTCAGTGGTAGAATTCTC
>4-1205244
GCACTGGTGGTTCAGTGGTAGAATTCTCGCC
>5-1189262
CTCGGGCTGAGACTTGAAGCA
-----------------------------------------------------
Number of collapsed reads:
6393700
1000
for blastn-short
Runs BLASTn using the blastn-short
task for sequences < 30bp.
Look for top match (-max_hsps 1
& -max_target_seqs 1
) for each query.
Examining 5 or more matches is recommended
by redirecting stdout: 2> /dev/null
# Load bash variables into memory
source .bashvars
time \
${ncbi_blastn} \
-db ${blast_dbs_dir}/${mirbase_mature_fasta_no_U%.*} \
-query ${output_dir_top}/${collapsed_reads_fasta} \
-out ${output_dir_top}/miRBase-BLASTn-eval_1000.outfmt6 \
-task blastn-short \
-max_hsps 1 \
-max_target_seqs 1 \
-outfmt 6 \
-num_threads ${threads} \
2> /dev/null
# Load bash variables into memory
source .bashvars
time \
${ncbi_blastn} \
-db ${blast_dbs_dir}/${mirgene_mature_fasta_no_U%.*} \
-query ${output_dir_top}/${collapsed_reads_fasta} \
-out ${output_dir_top}/MirGene-BLASTn-eval_1000.outfmt6 \
-task blastn-short \
-max_hsps 1 \
-max_target_seqs 1 \
-outfmt 6 \
-num_threads ${threads} \
2> /dev/null
Running this for simple comparison to the defaul blastn-short
value of 1000.
# Load bash variables into memory
source .bashvars
time \
${ncbi_blastn} \
-db ${blast_dbs_dir}/${mirbase_mature_fasta_no_U%.*} \
-query ${output_dir_top}/${collapsed_reads_fasta} \
-out ${output_dir_top}/miRBase-BLASTn-eval_10.outfmt6 \
-task blastn-short \
-evalue 10 \
-max_hsps 1 \
-max_target_seqs 1 \
-outfmt 6 \
-num_threads ${threads} \
2> /dev/null
# Load bash variables into memory
source .bashvars
time \
${ncbi_blastn} \
-db ${blast_dbs_dir}/${mirgene_mature_fasta_no_U%.*} \
-query ${output_dir_top}/${collapsed_reads_fasta} \
-out ${output_dir_top}/MirGene-BLASTn-eval_10.outfmt6 \
-task blastn-short \
-evalue 10 \
-max_hsps 1 \
-max_target_seqs 1 \
-outfmt 6 \
-num_threads ${threads} \
2> /dev/null
Running this for simple comparison to the default blastn-short
value of 1000.
# Load bash variables into memory
source .bashvars
time \
${ncbi_blastn} \
-db ${blast_dbs_dir}/${mirbase_mature_fasta_no_U%.*} \
-query ${output_dir_top}/${collapsed_reads_fasta} \
-out ${output_dir_top}/cindarian-miRBase-BLASTn-eval_1.outfmt6 \
-task blastn-short \
-evalue 1 \
-max_hsps 1 \
-max_target_seqs 1 \
-outfmt 6 \
-num_threads ${threads} \
2> /dev/null
# Load bash variables into memory
source .bashvars
time \
${ncbi_blastn} \
-db ${blast_dbs_dir}/${mirgene_mature_fasta_no_U%.*} \
-query ${output_dir_top}/${collapsed_reads_fasta} \
-out ${output_dir_top}/MirGene-BLASTn-eval_1.outfmt6 \
-task blastn-short \
-evalue 1 \
-max_hsps 1 \
-max_target_seqs 1 \
-outfmt 6 \
-num_threads ${threads} \
2> /dev/null
# Load bash variables into memory
source .bashvars
head ${output_dir_top}/*eval_1000.outfmt6
echo ""
echo "-----------------------------------------"
echo ""
wc -l ${output_dir_top}/*eval_1000.outfmt6
==> /home/shared/8TB_HDD_01/sam/gitrepos/deep-dive/F-Pmea/output/10.1-Pmea-sRNAseq-BLASTn-31bp-fastp-merged-cnidarian_miRBase/miRBase-BLASTn-eval_1000.outfmt6 <==
1-10257663 ssc-miR-7143-5p 100.000 11 0 0 12 22 8 18 1.5 22.3
2-2952210 cpo-miR-1379-5p 100.000 12 0 0 1 12 18 7 0.26 24.3
3-1527059 ppc-miR-8214-5p 100.000 13 0 0 12 24 18 6 0.11 26.3
4-1205244 ppc-miR-8214-5p 100.000 13 0 0 12 24 18 6 0.11 26.3
5-1189262 cpo-miR-1379-5p 100.000 12 0 0 1 12 18 7 0.26 24.3
6-630167 cpo-miR-509c-3p 100.000 12 0 0 13 24 8 19 0.37 24.3
7-548099 ppc-miR-8214-5p 100.000 13 0 0 12 24 18 6 0.11 26.3
8-512116 hpo-miR-10025-5p 100.000 10 0 0 11 20 17 8 4.9 20.3
9-432671 cpo-miR-1379-5p 100.000 12 0 0 1 12 18 7 0.34 24.3
10-407718 ppc-miR-8214-5p 100.000 13 0 0 12 24 18 6 0.11 26.3
==> /home/shared/8TB_HDD_01/sam/gitrepos/deep-dive/F-Pmea/output/10.1-Pmea-sRNAseq-BLASTn-31bp-fastp-merged-cnidarian_miRBase/MirGene-BLASTn-eval_1000.outfmt6 <==
1-10257663 Tca-Mir-87-P2_3p 100.000 11 0 0 5 15 13 3 0.54 22.3
2-2952210 Ovu-Novel-55_5p 100.000 9 0 0 10 18 14 6 7.3 18.3
3-1527059 Pma-Mir-96-P3o2_5p 100.000 11 0 0 16 26 9 19 0.66 22.3
4-1205244 Pma-Mir-96-P3o2_5p 100.000 11 0 0 16 26 9 19 0.78 22.3
5-1189262 Obi-Novel-51_3p 100.000 9 0 0 5 13 7 15 7.3 18.3
6-630167 Cpo-Mir-506-o3_3p 100.000 12 0 0 13 24 8 19 0.14 24.3
7-548099 Pma-Mir-96-P3o2_5p 100.000 11 0 0 16 26 9 19 0.62 22.3
8-512116 Ovu-Novel-18_5p 92.308 13 1 0 7 19 21 9 8.6 18.3
9-432671 Mml-Mir-2114_5p 100.000 11 0 0 13 23 11 21 0.51 22.3
10-407718 Pma-Mir-96-P3o2_5p 100.000 11 0 0 16 26 9 19 0.78 22.3
-----------------------------------------
6393700 /home/shared/8TB_HDD_01/sam/gitrepos/deep-dive/F-Pmea/output/10.1-Pmea-sRNAseq-BLASTn-31bp-fastp-merged-cnidarian_miRBase/miRBase-BLASTn-eval_1000.outfmt6
6393700 /home/shared/8TB_HDD_01/sam/gitrepos/deep-dive/F-Pmea/output/10.1-Pmea-sRNAseq-BLASTn-31bp-fastp-merged-cnidarian_miRBase/MirGene-BLASTn-eval_1000.outfmt6
12787400 total
# Load bash variables into memory
source .bashvars
head ${output_dir_top}/*eval_10.outfmt6
echo ""
echo "-----------------------------------------"
echo ""
wc -l ${output_dir_top}/*eval_10.outfmt6
==> /home/shared/8TB_HDD_01/sam/gitrepos/deep-dive/F-Pmea/output/10.1-Pmea-sRNAseq-BLASTn-31bp-fastp-merged-cnidarian_miRBase/miRBase-BLASTn-eval_10.outfmt6 <==
1-10257663 ssc-miR-7143-5p 100.000 11 0 0 12 22 8 18 1.5 22.3
2-2952210 cpo-miR-1379-5p 100.000 12 0 0 1 12 18 7 0.26 24.3
3-1527059 ppc-miR-8214-5p 100.000 13 0 0 12 24 18 6 0.11 26.3
4-1205244 ppc-miR-8214-5p 100.000 13 0 0 12 24 18 6 0.11 26.3
5-1189262 cpo-miR-1379-5p 100.000 12 0 0 1 12 18 7 0.26 24.3
6-630167 cpo-miR-509c-3p 100.000 12 0 0 13 24 8 19 0.37 24.3
7-548099 ppc-miR-8214-5p 100.000 13 0 0 12 24 18 6 0.11 26.3
8-512116 hpo-miR-10025-5p 100.000 10 0 0 11 20 17 8 4.9 20.3
9-432671 cpo-miR-1379-5p 100.000 12 0 0 1 12 18 7 0.34 24.3
10-407718 ppc-miR-8214-5p 100.000 13 0 0 12 24 18 6 0.11 26.3
==> /home/shared/8TB_HDD_01/sam/gitrepos/deep-dive/F-Pmea/output/10.1-Pmea-sRNAseq-BLASTn-31bp-fastp-merged-cnidarian_miRBase/MirGene-BLASTn-eval_10.outfmt6 <==
1-10257663 Tca-Mir-87-P2_3p 100.000 11 0 0 5 15 13 3 0.54 22.3
2-2952210 Ovu-Novel-55_5p 100.000 9 0 0 10 18 14 6 7.3 18.3
3-1527059 Pma-Mir-96-P3o2_5p 100.000 11 0 0 16 26 9 19 0.66 22.3
4-1205244 Pma-Mir-96-P3o2_5p 100.000 11 0 0 16 26 9 19 0.78 22.3
5-1189262 Obi-Novel-51_3p 100.000 9 0 0 5 13 7 15 7.3 18.3
6-630167 Cpo-Mir-506-o3_3p 100.000 12 0 0 13 24 8 19 0.14 24.3
7-548099 Pma-Mir-96-P3o2_5p 100.000 11 0 0 16 26 9 19 0.62 22.3
8-512116 Ovu-Novel-18_5p 92.308 13 1 0 7 19 21 9 8.6 18.3
9-432671 Mml-Mir-2114_5p 100.000 11 0 0 13 23 11 21 0.51 22.3
10-407718 Pma-Mir-96-P3o2_5p 100.000 11 0 0 16 26 9 19 0.78 22.3
-----------------------------------------
6367488 /home/shared/8TB_HDD_01/sam/gitrepos/deep-dive/F-Pmea/output/10.1-Pmea-sRNAseq-BLASTn-31bp-fastp-merged-cnidarian_miRBase/miRBase-BLASTn-eval_10.outfmt6
6010885 /home/shared/8TB_HDD_01/sam/gitrepos/deep-dive/F-Pmea/output/10.1-Pmea-sRNAseq-BLASTn-31bp-fastp-merged-cnidarian_miRBase/MirGene-BLASTn-eval_10.outfmt6
12378373 total
# Load bash variables into memory
source .bashvars
head ${output_dir_top}/*eval_1.outfmt6
echo ""
echo "-----------------------------------------"
echo ""
wc -l ${output_dir_top}/*eval_1.outfmt6
==> /home/shared/8TB_HDD_01/sam/gitrepos/deep-dive/F-Pmea/output/10.1-Pmea-sRNAseq-BLASTn-31bp-fastp-merged-cnidarian_miRBase/cindarian-miRBase-BLASTn-eval_1.outfmt6 <==
2-2952210 cpo-miR-1379-5p 100.000 12 0 0 1 12 18 7 0.26 24.3
3-1527059 ppc-miR-8214-5p 100.000 13 0 0 12 24 18 6 0.11 26.3
4-1205244 ppc-miR-8214-5p 100.000 13 0 0 12 24 18 6 0.11 26.3
5-1189262 cpo-miR-1379-5p 100.000 12 0 0 1 12 18 7 0.26 24.3
6-630167 cpo-miR-509c-3p 100.000 12 0 0 13 24 8 19 0.37 24.3
7-548099 ppc-miR-8214-5p 100.000 13 0 0 12 24 18 6 0.11 26.3
9-432671 cpo-miR-1379-5p 100.000 12 0 0 1 12 18 7 0.34 24.3
10-407718 ppc-miR-8214-5p 100.000 13 0 0 12 24 18 6 0.11 26.3
11-398433 ppc-miR-8214-5p 100.000 13 0 0 11 23 18 6 0.11 26.3
12-361684 spi-miR-L-temp-15_Stylophora_pistillata_Praher_et_al._2021_NA 100.000 22 0 0 1 22 1 22 3.12e-07 44.1
==> /home/shared/8TB_HDD_01/sam/gitrepos/deep-dive/F-Pmea/output/10.1-Pmea-sRNAseq-BLASTn-31bp-fastp-merged-cnidarian_miRBase/MirGene-BLASTn-eval_1.outfmt6 <==
1-10257663 Tca-Mir-87-P2_3p 100.000 11 0 0 5 15 13 3 0.54 22.3
3-1527059 Pma-Mir-96-P3o2_5p 100.000 11 0 0 16 26 9 19 0.66 22.3
4-1205244 Pma-Mir-96-P3o2_5p 100.000 11 0 0 16 26 9 19 0.78 22.3
6-630167 Cpo-Mir-506-o3_3p 100.000 12 0 0 13 24 8 19 0.14 24.3
7-548099 Pma-Mir-96-P3o2_5p 100.000 11 0 0 16 26 9 19 0.62 22.3
9-432671 Mml-Mir-2114_5p 100.000 11 0 0 13 23 11 21 0.51 22.3
10-407718 Pma-Mir-96-P3o2_5p 100.000 11 0 0 16 26 9 19 0.78 22.3
11-398433 Pma-Mir-96-P3o2_5p 100.000 11 0 0 15 25 9 19 0.62 22.3
12-361684 Xla-Mir-456-P2_3p 100.000 11 0 0 9 19 2 12 0.51 22.3
13-339071 Pma-Mir-96-P3o2_5p 100.000 11 0 0 15 25 9 19 0.74 22.3
-----------------------------------------
2914563 /home/shared/8TB_HDD_01/sam/gitrepos/deep-dive/F-Pmea/output/10.1-Pmea-sRNAseq-BLASTn-31bp-fastp-merged-cnidarian_miRBase/cindarian-miRBase-BLASTn-eval_1.outfmt6
2130019 /home/shared/8TB_HDD_01/sam/gitrepos/deep-dive/F-Pmea/output/10.1-Pmea-sRNAseq-BLASTn-31bp-fastp-merged-cnidarian_miRBase/MirGene-BLASTn-eval_1.outfmt6
5044582 total
The default e-value results in alignments for 100% of sequences.
Decreasing the e-value to 10
resulted in fewer query alignments, as we’d expect, but only minimally.
Decreasing the e-value to 1
resulted in ~40-45% of query alignments.
For further analysis, we should probably discuss a reasonable e-value to use for filtering, as it might be worthwhile to test even lower e-value thresholds.
For further analysis, we should probably discuss a reasonable e-value to use for filtering.