Use MirMachine (Umu et al. 2022) to identify potential miRNA homologs in the P.meandrina genome.
Inputs:
Outputs:
Software requirements:
Replace with name of your MirMachine environment and the path to the corresponding conda installation (find this after you’ve activated the environment).
E.g.
# Activate environment
conda activate ShortStack4_env
# Find conda path
which conda
mirmachine_conda_env_name <- c("mirmachine_env")
mirmachine_cond_path <- c("/home/sam/programs/mambaforge/condabin/conda")
This allows usage of Bash variables across R Markdown chunks.
{
echo "#### Assign Variables ####"
echo ""
echo "# Data directories"
echo 'export deep_dive_dir=/home/shared/8TB_HDD_02/shedurkin/deep-dive'
echo 'export output_dir_top=${deep_dive_dir}/F-Pmea/output/12-Pmea-sRNAseq-MirMachine'
echo ""
echo 'export mirmarchine_output_prefix="Pmea-MirMachine"'
echo "# Input/Output files"
echo 'export genome_fasta_dir=${deep_dive_dir}/F-Pmea/data/Pmea/'
echo 'export genome_fasta_name="Pocillopora_meandrina_HIv1.assembly.fasta"'
echo 'export genome_fasta="${genome_fasta_dir}/${genome_fasta_name}"'
echo ""
echo "# External data URLs"
echo 'export genome_fasta_url="http://cyanophora.rutgers.edu/Pocillopora_meandrina/Pocillopora_meandrina_HIv1.assembly.fasta.gz"'
echo ""
echo "# Set number of CPUs to use"
echo 'export threads=40'
echo ""
} > .bashvars
cat .bashvars
#### Assign Variables ####
# Data directories
export deep_dive_dir=/home/shared/8TB_HDD_02/shedurkin/deep-dive
export output_dir_top=${deep_dive_dir}/F-Pmea/output/12-Pmea-sRNAseq-MirMachine
export mirmarchine_output_prefix="Pmea-MirMachine"
# Input/Output files
export genome_fasta_dir=${deep_dive_dir}/F-Pmea/data/Pmea/
export genome_fasta_name="Pocillopora_meandrina_HIv1.assembly.fasta"
export genome_fasta="${genome_fasta_dir}/${genome_fasta_name}"
# External data URLs
export genome_fasta_url="http://cyanophora.rutgers.edu/Pocillopora_meandrina/Pocillopora_meandrina_HIv1.assembly.fasta.gz"
# Set number of CPUs to use
export threads=40
If this is successful, the first line of output should show that the Python being used is the one in your MirMachine conda environment path.
E.g.
python: /home/sam/programs/mambaforge/envs/mirmachine_env/bin/python
use_condaenv(condaenv = mirmachine_conda_env_name, conda = mirmachine_cond_path)
# Check successful env loading
py_config()
python: /home/sam/programs/mambaforge/envs/mirmachine_env/bin/python
libpython: /home/sam/programs/mambaforge/envs/mirmachine_env/lib/libpython3.10.so
pythonhome: /home/sam/programs/mambaforge/envs/mirmachine_env:/home/sam/programs/mambaforge/envs/mirmachine_env
version: 3.10.12 | packaged by conda-forge | (main, Jun 23 2023, 22:40:32) [GCC 12.3.0]
numpy: /home/sam/programs/mambaforge/envs/mirmachine_env/lib/python3.10/site-packages/numpy
numpy_version: 1.25.2
NOTE: Python version was forced by use_python() function
http://cyanophora.rutgers.edu/Pocillopora_meandrina/Pocillopora_meandrina_HIv1.assembly.fasta.gz
)# Load bash variables into memory
source .bashvars
wget \
--directory-prefix ${genome_fasta_dir} \
--recursive \
--no-check-certificate \
--continue \
--no-host-directories \
--no-directories \
--no-parent \
--quiet \
--execute robots=off \
--accept "${genome_fasta_name}.gz,md5checksums.txt" ${genome_fasta_url}
ls -lh "${genome_fasta_dir}"
total 819M
-rw-r--r-- 1 shedurkin labmembers 360M Nov 30 07:59 Pocillopora_meandrina_HIv1.assembly.fa
-rw-r--r-- 1 shedurkin labmembers 360M May 23 2022 Pocillopora_meandrina_HIv1.assembly.fasta
-rw-r--r-- 1 shedurkin labmembers 101M May 23 2022 Pocillopora_meandrina_HIv1.assembly.fasta.gz
# Load bash variables into memory
source .bashvars
cd "${genome_fasta_dir}"
# Checksums file contains other files, so this just looks for the sRNAseq files.
grep "${genome_fasta_name}" md5checksums.txt | md5sum --check
# Load bash variables into memory
source .bashvars
cd "${genome_fasta_dir}"
gunzip --keep "${genome_fasta_name}.gz"
ls -lth
gzip: Pocillopora_meandrina_HIv1.assembly.fasta already exists; not overwritten
total 819M
-rw-r--r-- 1 shedurkin labmembers 360M Nov 30 07:59 Pocillopora_meandrina_HIv1.assembly.fa
-rw-r--r-- 1 shedurkin labmembers 360M May 23 2022 Pocillopora_meandrina_HIv1.assembly.fasta
-rw-r--r-- 1 shedurkin labmembers 101M May 23 2022 Pocillopora_meandrina_HIv1.assembly.fasta.gz
The --species
option specifies output naming structure
# Load bash variables into memory
source .bashvars
cd "${output_dir_top}"
time \
MirMachine.py \
--node Metazoa \
--species ${mirmarchine_output_prefix} \
--genome ${genome_fasta} \
--cpu 40 \
--add-all-nodes \
&> mirmachine.log
real 0m2.359s
user 0m1.902s
sys 0m0.483s
# Load bash variables into memory
source .bashvars
tree -h ${output_dir_top}/results/predictions
/home/shared/8TB_HDD_02/shedurkin/deep-dive/F-Pmea/output/12-Pmea-sRNAseq-MirMachine/results/predictions
├── [4.0K] fasta
│ └── [ 39K] Pmea-MirMachine.PRE.fasta
├── [4.0K] filtered_gff
│ └── [ 18K] Pmea-MirMachine.PRE.gff
├── [4.0K] gff
│ └── [ 75K] Pmea-MirMachine.PRE.gff
└── [4.0K] heatmap
└── [ 30K] Pmea-MirMachine.heatmap.csv
4 directories, 4 files
# Load bash variables into memory
source .bashvars
echo "Total number of predicted miRNAs (high & low confidence):"
grep --count "^>" "${output_dir_top}/results/predictions/fasta/${mirmarchine_output_prefix}.PRE.fasta"
echo ""
echo "----------------------------------------------------------------------------------"
echo ""
head "${output_dir_top}/results/predictions/fasta/${mirmarchine_output_prefix}.PRE.fasta"
Total number of predicted miRNAs (high & low confidence):
261
----------------------------------------------------------------------------------
>Iab-4.PRE_Pocillopora_meandrina_HIv1___Sc0000020_3039233_3039278_(-)_LOWconf
CAGTATACTTAAAGTATACTTTAAAGTATACTTTAAGTATACTTAG
>Mir-10.PRE_Pocillopora_meandrina_HIv1___Sc0000003_10366054_10366110_(+)_HIGHconf
AACCCGTAGATCCGAACTTGTGGGAATTTCTCACCACAGGTTCGTGTCTATGGTCCA
>Mir-1175.PRE_Pocillopora_meandrina_HIv1___Sc0000005_198860_198924_(+)_LOWconf
AGTTAAAGGTGTAATTTCTTACAGTGGGTAATATATATACTGTAAGATTTTACGCCTTTAATTAA
>Mir-124.PRE_Pocillopora_meandrina_HIv1___Sc0000040_962575_962635_(+)_LOWconf
AGGAGTCACTTAGGGCCTTATTCAAAATAATGTTTGGAATAAGGCCCTAGGTGACTCCTGA
>Mir-124.PRE_Pocillopora_meandrina_HIv1___Sc0000040_1120043_1120103_(+)_LOWconf
AGGAGTCACTTAGGGCCTTATTCAAAATAATGTTTGGAATAAGGCCCTAGGTGACTCCTGA
Per MirMachine recommendations:
filtered_gff/
High confidence miRNA family predictions after bitscore filtering. (This file is what you need in most cases)
So, we’ll stick with that.
# Load bash variables into memory
source .bashvars
# Avoid 10th line - lengthy list of all miRNA families examined
head -n 9 "${output_dir_top}/results/predictions/filtered_gff/${mirmarchine_output_prefix}.PRE.gff"
echo ""
# Pull out lines that do _not_ begin with a '#'.
grep "^[^#]" "${output_dir_top}/results/predictions/filtered_gff/${mirmarchine_output_prefix}.PRE.gff" \
| head
##gff-version 3
# MirMachine version: 0.2.12
# CM Models: Built using MirGeneDB 2.1(2022)
# Total families searched: 556
# Node: Metazoa
# Model: combined
# Genome file: /home/shared/8TB_HDD_02/shedurkin/deep-dive/F-Pmea/data/Pmea//Pocillopora_meandrina_HIv1.assembly.fasta
# Species: Pmea-MirMachine
# Params: /home/sam/programs/mambaforge/envs/mirmachine_env/bin/MirMachine.py --node Metazoa --species Pmea-MirMachine --genome /home/shared/8TB_HDD_02/shedurkin/deep-dive/F-Pmea/data/Pmea//Pocillopora_meandrina_HIv1.assembly.fasta --cpu 40 --add-all-nodes
Pocillopora_meandrina_HIv1___Sc0000003 cmsearch ncRNA 10366054 10366110 47.3 + . gene_id=Mir-10.PRE;E-value=2e-06;sequence_with_30nt=CAGTCACGGAAAGATCCAGCGGTTCATCTGAACCCGTAGATCCGAACTTGTGGGAATTTCTCACCACAGGTTCGTGTCTATGGTCCACGTGTACTGCATTACTACAATGAAACTTAA
Pocillopora_meandrina_HIv1___Sc0000000 cmsearch ncRNA 10651117 10651192 35.4 + . gene_id=Mir-303.PRE;E-value=0.0081;sequence_with_30nt=GGTTTTTTAACTAATCTACACTTTTGACGCTCTACACTTTCAGAGAAGCCCAATTTCGTTACCAACGTTGGTTGGTAACGAAATTGGGCTTCTCTGAAATTTAGATCACAAAAAACAGCGTTTTTCCTTTTTCGTT
Pocillopora_meandrina_HIv1___Sc0000000 cmsearch ncRNA 10655950 10656025 35.4 + . gene_id=Mir-303.PRE;E-value=0.0081;sequence_with_30nt=GGTTTTTTAACTAATCTACACTTTTGACGCTCTACACTTTCAGAGAAGCCCAATTTCGTTACCAACGTTGGTTGGTAACGAAATTGGGCTTCTCTGAAATTTAGATCACAAAAAACAGCGTTTTTCCTTTTTCGTT
Pocillopora_meandrina_HIv1___Sc0000000 cmsearch ncRNA 10660782 10660857 35.4 + . gene_id=Mir-303.PRE;E-value=0.0081;sequence_with_30nt=GGTTTTTTAACTAATCTACACTTTTGACGCTCTACACTTTCAGAGAAGCCCAATTTCGTTACCAACGTTGGTTGGTAACGAAATTGGGCTTCTCTGAAATTTAGATCACAAAAAACAGCGTTTTTCCTTTTTCGTT
Pocillopora_meandrina_HIv1___Sc0000000 cmsearch ncRNA 10666666 10666741 35.4 - . gene_id=Mir-303.PRE;E-value=0.0081;sequence_with_30nt=GGTTTTTTAACTAATCTACACTTTTGACGCTCTACACTTTCAGAGAAGCCCAATTTCGTTACCAACGTTGGTTGGTAACGAAATTGGGCTTCTCTGAAATTTAGATCACAAAAAACAGCGTTTTTCCTTTTTCGTT
Pocillopora_meandrina_HIv1___Sc0000003 cmsearch ncRNA 15185321 15185383 39.1 + . gene_id=Mir-303.PRE;E-value=0.00083;sequence_with_30nt=TCTTCTTCTTCTTCTTCGAATGCTCAGACGATAGCTTCAGTGGAAATATGTTTGTTTACTTGTAAACAAACATATTTCCACTGAAGCATTTTTATCGGTCGTTTGATTGTTCTGTCTATTCTG
Pocillopora_meandrina_HIv1___Sc0000000 cmsearch ncRNA 3084744 3084853 35.4 + . gene_id=Mir-9388.PRE;E-value=0.0088;sequence_with_30nt=ATAAGTACGTACGCATTCGTGCGGACTTACGTGCGTGCGTGCGTACATGCGTGCGTGTGTGCGTACATGCGTACGTGCGTGCGTGCGTACGCACGGATGTACGCACCTACGTACGCACGAATGTACGCACGTACGCATGCACGTGCGTACGTACGCATGTTTGTACGTAC
Pocillopora_meandrina_HIv1___Sc0000000 cmsearch ncRNA 19092618 19092677 41.6 + . gene_id=Mir-9388.PRE;E-value=0.00021;sequence_with_30nt=GTATATATATACATGTATATGTATATATATGTGTGTGTATATATATATATATATATATATGTATGCATATGTATATATATACACACACACACACACACATATATATATACACCTGTTCAA
Pocillopora_meandrina_HIv1___Sc0000002 cmsearch ncRNA 645851 645914 38.5 - . gene_id=Mir-9388.PRE;E-value=0.0014;sequence_with_30nt=ATATATATATATATATATACATATATACATATACATATACATATACATATATATACATATACATATGTATATATATGTATATGTATATGTATGTATATATATATATATATATATACATCTATGA
Pocillopora_meandrina_HIv1___Sc0000003 cmsearch ncRNA 4966110 4966167 40.3 + . gene_id=Mir-9388.PRE;E-value=0.00048;sequence_with_30nt=TTCGCTCAATTCTATGACGAATTATGGGTGGTACGTACGTGCGTACGTATGCACGTACGTACGTGCGTACGTATGCACGTACGTACATGCGTTGTTTTTAGAGATCCGTTGTTACTTA
# Load bash variables into memory
source .bashvars
# Skip lines which begin with '#' (i.e. the header components)
echo "Predicted loci:"
grep -c "^[^#]" "${output_dir_top}/results/predictions/filtered_gff/${mirmarchine_output_prefix}.PRE.gff"
echo ""
echo "Unique miRNA families:"
grep "^[^#]" "${output_dir_top}/results/predictions/filtered_gff/${mirmarchine_output_prefix}.PRE.gff" \
| awk -F"[\t=;]" '{print $10}' \
| sort -u \
| wc -l
Predicted loci:
40
Unique miRNA families:
7