Use MirMachine (Umu et al. 2022) to identify potential miRNA homologs in the A.millepora genome.
The A.millepora genome will be used as the reference genome for A.pulchra, as A.pulchra does not currently have a sequenced genome and A.millepora had highest alignment rates for standard RNAseq data compared to other published genomes tested.
Inputs:
Outputs:
Software requirements:
Replace with name of your MirMachine environment and the path to the corresponding conda installation (find this after you’ve activated the environment).
E.g.
# Activate environment
conda activate ShortStack4_env
# Find conda path
which conda
mirmachine_conda_env_name <- c("mirmachine_env")
mirmachine_cond_path <- c("/home/sam/programs/mambaforge/condabin/conda")
This allows usage of Bash variables across R Markdown chunks.
{
echo "#### Assign Variables ####"
echo ""
echo "# Data directories"
echo 'export deep_dive_dir=/home/shared/8TB_HDD_01/sam/gitrepos/deep-dive'
echo 'export output_dir_top=${deep_dive_dir}/D-Apul/output/12-Apul-sRNAseq-MirMachine'
echo ""
echo 'export mirmarchine_output_prefix="Amil-MirMachine"'
echo "# Input/Output files"
echo 'export genome_fasta_dir=${deep_dive_dir}/D-Apul/data/Amil/ncbi_dataset/data/GCF_013753865.1'
echo 'export genome_fasta_name="GCF_013753865.1_Amil_v2.1_genomic.fna"'
echo 'export genome_fasta="${genome_fasta_dir}/${genome_fasta_name}"'
echo ""
echo "# External data URLs"
echo 'export genome_fasta_url="https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/013/753/865/GCF_013753865.1_Amil_v2.1/"'
echo ""
echo "# Set number of CPUs to use"
echo 'export threads=40'
echo ""
} > .bashvars
cat .bashvars
#### Assign Variables ####
# Data directories
export deep_dive_dir=/home/shared/8TB_HDD_01/sam/gitrepos/deep-dive
export output_dir_top=${deep_dive_dir}/D-Apul/output/12-Apul-sRNAseq-MirMachine
export mirmarchine_output_prefix="Amil-MirMachine"
# Input/Output files
export genome_fasta_dir=${deep_dive_dir}/D-Apul/data/Amil/ncbi_dataset/data/GCF_013753865.1
export genome_fasta_name="GCF_013753865.1_Amil_v2.1_genomic.fna"
export genome_fasta="${genome_fasta_dir}/${genome_fasta_name}"
# External data URLs
export genome_fasta_url="https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/013/753/865/GCF_013753865.1_Amil_v2.1/"
# Set number of CPUs to use
export threads=40
If this is successful, the first line of output should show that the Python being used is the one in your MirMachine conda environment path.
E.g.
python: /home/sam/programs/mambaforge/envs/mirmachine_env/bin/python
use_condaenv(condaenv = mirmachine_conda_env_name, conda = mirmachine_cond_path)
# Check successful env loading
py_config()
python: /home/sam/programs/mambaforge/envs/mirmachine_env/bin/python
libpython: /home/sam/programs/mambaforge/envs/mirmachine_env/lib/libpython3.10.so
pythonhome: /home/sam/programs/mambaforge/envs/mirmachine_env:/home/sam/programs/mambaforge/envs/mirmachine_env
version: 3.10.12 | packaged by conda-forge | (main, Jun 23 2023, 22:40:32) [GCC 12.3.0]
numpy: /home/sam/programs/mambaforge/envs/mirmachine_env/lib/python3.10/site-packages/numpy
numpy_version: 1.25.2
NOTE: Python version was forced by use_python() function
GCF_013753865.1_Amil_v2.1
)# Load bash variables into memory
source .bashvars
wget \
--directory-prefix ${genome_fasta_dir} \
--recursive \
--no-check-certificate \
--continue \
--no-host-directories \
--no-directories \
--no-parent \
--quiet \
--execute robots=off \
--accept "${genome_fasta_name}.gz,md5checksums.txt" ${genome_fasta_url}
ls -lh "${genome_fasta_dir}"
total 141M
-rw-r--r-- 1 sam sam 141M Oct 14 2021 GCF_013753865.1_Amil_v2.1_genomic.fna.gz
-rw-r--r-- 1 sam sam 13K Oct 14 2021 md5checksums.txt
# Load bash variables into memory
source .bashvars
cd "${genome_fasta_dir}"
# Checksums file contains other files, so this just looks for the sRNAseq files.
grep "${genome_fasta_name}" md5checksums.txt | md5sum --check
./GCF_013753865.1_Amil_v2.1_genomic.fna.gz: OK
# Load bash variables into memory
source .bashvars
cd "${genome_fasta_dir}"
gunzip --keep "${genome_fasta_name}.gz"
ls -lth
total 601M
-rw-r--r-- 1 sam sam 13K Oct 14 2021 md5checksums.txt
-rw-r--r-- 1 sam sam 460M Oct 14 2021 GCF_013753865.1_Amil_v2.1_genomic.fna
-rw-r--r-- 1 sam sam 141M Oct 14 2021 GCF_013753865.1_Amil_v2.1_genomic.fna.gz
The --species
option specifies output naming structure
# Load bash variables into memory
source .bashvars
cd "${output_dir_top}"
time \
MirMachine.py \
--node Metazoa \
--species ${mirmarchine_output_prefix} \
--genome ${genome_fasta} \
--cpu 40 \
--add-all-nodes \
&> mirmachine.log
real 31m24.827s
user 863m38.571s
sys 14m22.499s
# Load bash variables into memory
source .bashvars
tree -h ${output_dir_top}/results/predictions
/home/shared/8TB_HDD_01/sam/gitrepos/deep-dive/D-Apul/output/12-Apul-sRNAseq-MirMachine/results/predictions
├── [4.0K] fasta
│ └── [108K] Amil-MirMachine.PRE.fasta
├── [4.0K] filtered_gff
│ └── [ 35K] Amil-MirMachine.PRE.gff
├── [4.0K] gff
│ └── [211K] Amil-MirMachine.PRE.gff
└── [4.0K] heatmap
└── [ 30K] Amil-MirMachine.heatmap.csv
4 directories, 4 files
# Load bash variables into memory
source .bashvars
echo "Total number of predicted miRNAs (high & low confidence):"
grep --count "^>" "${output_dir_top}/results/predictions/fasta/${mirmarchine_output_prefix}.PRE.fasta"
echo ""
echo "----------------------------------------------------------------------------------"
echo ""
head "${output_dir_top}/results/predictions/fasta/${mirmarchine_output_prefix}.PRE.fasta"
Total number of predicted miRNAs (high & low confidence):
870
----------------------------------------------------------------------------------
>Bantam.PRE_NC_058071.1_9771121_9771177_(-)_LOWconf
TGATTTTCTAATGGTCTTGATTTGAATTATTCAAATTAAGACCATTAGAAAATCAAT
>Iab-4.PRE_NC_058069.1_2974897_2974978_(-)_LOWconf
accaatacagaaagtatcccaaacacttgtgaAAGCTagccttaggcctaattagtgtttgggatactttctgtattggtaa
>Iab-4.PRE_NC_058075.1_7368350_7368473_(+)_LOWconf
accaatacagaaagtatcctaaacacttgtaaaagctaaccttaggcctaattaggcctaaacagacagttttaggcctaaggttagcttttaaaagtgtttgggatactttctgtattggtaa
>Iab-4.PRE_NC_058079.1_10689081_10689141_(+)_LOWconf
AAGTATACTTAAAGTATACTTCAAGTATACTTCCTTTTAGGTATACTTTAAGTATACTTTT
>Iab-4.PRE_NW_025322626.1_797278_797371_(-)_LOWconf
accaatacagaaagtatcctaaacattcataaaagctaaccttaggcctaaagtTTGCTTtcatgaatgtttaggatactttctgtattggtaa
Per MirMachine recommendations:
filtered_gff/
High confidence miRNA family predictions after bitscore filtering. (This file is what you need in most cases)
So, we’ll stick with that.
# Load bash variables into memory
source .bashvars
# Avoid 10th line - lengthy list of all miRNA families examined
head -n 9 "${output_dir_top}/results/predictions/filtered_gff/${mirmarchine_output_prefix}.PRE.gff"
echo ""
# Pull out lines that do _not_ begin with a '#'.
grep "^[^#]" "${output_dir_top}/results/predictions/filtered_gff/${mirmarchine_output_prefix}.PRE.gff" \
| head
##gff-version 3
# MirMachine version: 0.2.12
# CM Models: Built using MirGeneDB 2.1(2022)
# Total families searched: 556
# Node: Metazoa
# Model: combined
# Genome file: /home/shared/8TB_HDD_01/sam/gitrepos/deep-dive/D-Apul/data/Amil/ncbi_dataset/data/GCF_013753865.1/GCF_013753865.1_Amil_v2.1_genomic.fna
# Species: Amil-MirMachine
# Params: /home/sam/programs/mambaforge/envs/mirmachine_env/bin/MirMachine.py --node Metazoa --species Amil-MirMachine --genome /home/shared/8TB_HDD_01/sam/gitrepos/deep-dive/D-Apul/data/Amil/ncbi_dataset/data/GCF_013753865.1/GCF_013753865.1_Amil_v2.1_genomic.fna --cpu 40 --add-all-nodes
NC_058073.1 cmsearch ncRNA 12437105 12437152 41.9 + . gene_id=Mir-10.PRE;E-value=6.4e-05;sequence_with_30nt=CGGTGGAATCTTCTGGCACAGCACGTGATCCCGTAGATCCGAACTTGTGGGTTTTCTTCCACAAGTTCGTCTCTATGGTTTACGTGGTTTGCATTGAAAACGCATCCT
NC_058067.1 cmsearch ncRNA 4022302 4022373 38.6 - . gene_id=Mir-2.PRE;E-value=7.8e-05;sequence_with_30nt=tgtatttaaatttaaattaaacaaaacaaaacatcaaaacTGCTGGTGATATTTctgaataattgttattattcagAAATATCACAAGCAGTtttgatgttttgtttgtttgttttatttttaattatcaCA
NC_058067.1 cmsearch ncRNA 34950489 34950547 37.2 + . gene_id=Mir-2.PRE;E-value=0.0002;sequence_with_30nt=AagcaaataacaataattgttacagTGTGACAGTGAAAACTGGCAatgatgcaataattattattgcatcaTTGCCAgttttcactgtcacaccatcttcaaaaccattcaagaaaatt
NC_058069.1 cmsearch ncRNA 1062615 1062677 31.8 + . gene_id=Mir-2.PRE;E-value=0.0082;sequence_with_30nt=ttggacaattcaaacccttgaagtgatatgatatcatttcactgcagtgaaatgataataaattatcatatcactgcagtgaaatgatatcatatcacttccagcgtatcatttttattcacg
NC_058078.1 cmsearch ncRNA 18791646 18791701 33.3 + . gene_id=Mir-2.PRE;E-value=0.003;sequence_with_30nt=aaattggcggtgaaacttttttaaaagttttaaaaaaaattggcagtgaaacttttttaaaagtttcactgccaattttttttatggaaatGAAAACAATCATAAATTGAGGGTTA
NC_058070.1 cmsearch ncRNA 30334965 30335020 48.4 + . gene_id=Mir-2189.PRE;E-value=4.5e-06;sequence_with_30nt=aataattattatataaatattataCTCGCCACAGGTGAcccacacaataataataataattattttattgtgtgGGTCACCTGTGGCAAGTTAAAAGATGTGAACTTAATCCCACT
NW_025322780.1 cmsearch ncRNA 310662 310720 36.0 + . gene_id=Mir-2189.PRE;E-value=0.0077;sequence_with_30nt=gatacgtttgtcaaataatttgtatcgcaggcgatgtacaaagacaaaacaaaatgataattattttgttttgtctctgtacatcgcctgcgatacttattatgagcacgtcacttgat
NC_058067.1 cmsearch ncRNA 36230531 36230626 36.8 + . gene_id=Mir-2279.PRE;E-value=0.0078;sequence_with_30nt=TAAGAAACTATTACTATGGCAAAGGATCAAGTGAATACCTTCGCGCGAATCGAATTTTTGttataagaaaacttaaaccataTCTAAACGTGCAAACAGACTCGATTCGCGCGATGGTATTCACTTGAAGAATTTTTTGTGATCGTTTGTCATTAT
NC_058069.1 cmsearch ncRNA 11953990 11954047 40.4 + . gene_id=Mir-2279.PRE;E-value=0.00086;sequence_with_30nt=ggtagagtgctcgatgcgtttcgcagagcgtagtatatctgagagtaggactaatcaattgattagtcctactctcatatatatatatatatacatttttttgtgCTTTCATTTCTTG
NC_058072.1 cmsearch ncRNA 1990458 1990517 36.5 - . gene_id=Mir-2279.PRE;E-value=0.0098;sequence_with_30nt=TTTTAACTGTTTTAACTATTTTTTAGAGTATTAACCAGGCATGCGTGCGATTATGTTTAAAAACATAATCGCACGCATCCCTGGTTGATACTCTGGATTCTGCGACGAAGAGAGTCCTCA
grep: write error: Broken pipe
# Load bash variables into memory
source .bashvars
# Skip lines which begin with '#' (i.e. the header components)
echo "Predicted loci:"
grep -c "^[^#]" "${output_dir_top}/results/predictions/filtered_gff/${mirmarchine_output_prefix}.PRE.gff"
echo ""
echo "Unique miRNA families:"
grep "^[^#]" "${output_dir_top}/results/predictions/filtered_gff/${mirmarchine_output_prefix}.PRE.gff" \
| awk -F"[\t=;]" '{print $10}' \
| sort -u \
| wc -l
Predicted loci:
109
Unique miRNA families:
11