---
title: "11.2-Apul-sRNAseq-miRdeep2-31bp-fastp-merged-cnidarian_only"
author: "Sam White"
date: "2024-05-16"
output: 
  github_document:
    toc: true
    number_sections: true
  bookdown::html_document2:
    theme: cosmo
    toc: true
    toc_float: true
    number_sections: true
    code_folding: show
    code_download: true
  html_document:
    theme: cosmo
    toc: true
    toc_float: true
    number_sections: true
    code_folding: show
    code_download: true
bibliography: references.bib
link-citations: true
---

Use [miRDeep2](https://github.com/rajewsky-lab/mirdeep2) [@friedländer2011] to identify potential miRNAs using _A.pulchra_ sRNAseq reads. The *A.millepora* genome will be used as the reference genome for *A.pulchra*, as *A.pulchra* does not currently have a sequenced genome and *A.millepora* had the highest alignment rates for standard RNAseq data compared to other published genomes tested.

------------------------------------------------------------------------

Inputs:

- Trimmed and merged reads. See [08.2-Apul-sRNAseq-trimming-31bp-fastp-merged.Rmd](./08.2-Apul-sRNAseq-trimming-31bp-fastp-merged.Rmd) for code.

- *A.millepora* genome FastA. See [12-Apul-sRNAseq-MirMachine.Rmd](https://github.com/urol-e5/deep-dive/blob/main/D-Apul/code/12-Apul-sRNAseq-MirMachine.Rmd) for download info if needed.

- [Cnidarian-only miRNA FastA](../../data/cnidarian_miRNAs.fasta)

  - Cnidarian miRNA culled from literature by Jill Ashey.

Outputs:

- Primary outputs are a result table in BED, CSV (tab-delimited), and HTML formats.

  - Due to the nature of mirDeep2's naming, trying to use variable names is challenging. As such, the chunks processing those files will require manual intervention to identify and provide the output filename(s); they are _not_ handled in the `.bashvars` file at the top of this script.

- Other output files are too large for GitHub (and some (all?) are not needed for this analysis).
Please find a full backup here: https://gannet.fish.washington.edu/Atumefaciens/gitrepos/deep-dive/D-Apul/output/11.2-Apul-sRNAseq-miRdeep2-31bp-fastp-merged-cnidarian_only/

------------------------------------------------------------------------

```{r setup, include=FALSE}
library(knitr)
library(kableExtra)
library(dplyr)

# Document-wide chunk defaults. Individual chunks override `eval` in their
# own headers, so only chunks explicitly marked `eval=TRUE` are executed.
knitr::opts_chunk$set(
  eval = FALSE,     # Chunks are not evaluated unless they opt in
  echo = TRUE,      # Show the source of every chunk
  message = FALSE,  # Suppress messages in the rendered output
  warning = FALSE,  # Suppress warnings in the rendered output
  comment = "",     # No '##' prefix on lines of printed output
  width = 1000      # Original intent: wide output so a scroll bar appears
)
```

# Create a Bash variables file

This allows usage of Bash variables across R Markdown chunks.

```{r save-bash-variables-to-rvars-file, engine='bash', eval=TRUE}
# Write the shared settings to `.bashvars`; every later bash chunk sources it.
# The quoted heredoc delimiter keeps `${...}` references literal in the file,
# so they are only expanded when `.bashvars` is sourced.
cat > .bashvars << 'EOF'
#### Assign Variables ####

# Trimmed FastQ naming pattern
export trimmed_fastqs_pattern='*fastp-adapters-polyG-31bp-merged.fq.gz'

# Data directories
export deep_dive_dir=/home/shared/8TB_HDD_01/sam/gitrepos/deep-dive
export deep_dive_data_dir="${deep_dive_dir}/data"
export output_dir_top=${deep_dive_dir}/D-Apul/output/11.2-Apul-sRNAseq-miRdeep2-31bp-fastp-merged-cnidarian_only
export genome_fasta_dir=${deep_dive_dir}/D-Apul/data/Amil/ncbi_dataset/data/GCF_013753865.1
export trimmed_fastqs_dir="${deep_dive_dir}/D-Apul/output/08.2-Apul-sRNAseq-trimming-31bp-fastp-merged/trimmed-reads"

# Input/Output files
export collapsed_reads_gt17bp_mirdeep2="collapsed_reads_gt17bp_mirdeep2.fasta"
export concatenated_trimmed_reads_fasta="concatenated-trimmed-reads-all.fasta"
export concatenated_trimmed_reads_fasta_no_spaces="concatenated-trimmed-reads-all-no_spaces.fasta"
export genome_fasta_name="GCF_013753865.1_Amil_v2.1_genomic.fna"
export genome_fasta_no_spaces="GCF_013753865.1_Amil_v2.1_genomic-no_spaces.fna"
export mirdeep2_mapping_file="Apul-mirdeep2-mapping.arf"
export cnidarian_mirna_fasta_name="cnidarian_miRNAs.fasta"

# Paths to programs
export mirdeep2_mapper="mapper.pl"
export mirdeep2="miRDeep2.pl"
export mirdeep2_fastaparse="fastaparse.pl"
export bowtie_build="/home/shared/bowtie-1.3.1-linux-x86_64/bowtie-build"
export seqtk="/home/shared/seqtk-1.4/seqtk"

# Set number of CPUs to use
export threads=40

EOF

# Echo the file so the rendered document records the settings used
cat .bashvars
```

# Prepare reads for miRDeep2

## Convert FastQ to FastA

```{r covert-fastq-to-fasta, eval=TRUE, engine='bash'}
# Bring the shared settings into this chunk's shell
source .bashvars

# Ensure the output directory is present
mkdir --parents "${output_dir_top}"

# Single FastA that collects the reads from every trimmed FastQ
all_reads_fasta="${output_dir_top}/${concatenated_trimmed_reads_fasta}"

# Only build the concatenated FastA when it is absent; results are appended,
# so re-running over an existing file would duplicate reads.
if [[ ! -f "${all_reads_fasta}" ]]; then
  # The pattern is left unquoted on purpose so the shell expands the glob
  for fq in "${trimmed_fastqs_dir}"/${trimmed_fastqs_pattern}; do
    "${seqtk}" seq -A "${fq}" >> "${all_reads_fasta}"
  done
fi

# Preview the result
head "${all_reads_fasta}"
```

## Remove Spaces in Reads FastA

```{r remove-spaces-reads-FastA, eval=TRUE, engine='bash'}
# Bring the shared settings into this chunk's shell
source .bashvars

reads_in="${output_dir_top}/${concatenated_trimmed_reads_fasta}"
reads_out="${output_dir_top}/${concatenated_trimmed_reads_fasta_no_spaces}"

# Replace spaces with underscores on description (">") lines only;
# skipped when the cleaned file already exists.
if [[ ! -f "${reads_out}" ]]; then
  sed '/^>/ s/ /_/g' "${reads_in}" > "${reads_out}"
fi

# Show the first description lines before and after, for comparison
grep "^>" "${reads_in}" | head

printf '\n%s\n\n' "--------------------------------------------------"

grep "^>" "${reads_out}" | head
```

# Reformat genome FastA description lines

miRDeep2 can't process genome FastAs with spaces in the description lines.

So, I'm replacing spaces with underscores.

And, for aesthetics, I'm also removing commas.
```{r remove-spaces-genome-FastA, eval=TRUE, engine='bash'}
# Load bash variables into memory
source .bashvars

# On description (">") lines only: spaces become underscores, commas are removed
sed -e '/^>/ s/ /_/g' -e '/^>/ s/,//g' \
  "${genome_fasta_dir}/${genome_fasta_name}" \
  > "${genome_fasta_dir}/${genome_fasta_no_spaces}"

# Show the first description lines before and after, for comparison
grep "^>" "${genome_fasta_dir}/${genome_fasta_name}" \
| head

echo ""
echo "--------------------------------------------------"
echo ""

grep "^>" "${genome_fasta_dir}/${genome_fasta_no_spaces}" \
| head
```

# Bowtie v1 genome index

miRDeep2 requires a Bowtie v1 genome index - cannot use Bowtie2 index

```{r bowtie1-index, eval=FALSE, engine='bash'}
# Load bash variables into memory
source .bashvars

# Index prefix: the cleaned genome FastA path without its final extension.
# The mapping chunk passes this same prefix to mapper.pl.
bowtie_index_prefix="${genome_fasta_dir}/${genome_fasta_no_spaces%.*}"

# Check for existence of genome index first.
# The check must use the full path of the index prefix; checking the bare
# file name would look in the current directory and never find the index,
# forcing a rebuild on every run.
# bowtie-build writes `.ebwt` files (or `.ebwtl` for large indexes).
if [ ! -f "${bowtie_index_prefix}.1.ebwt" ] && [ ! -f "${bowtie_index_prefix}.1.ebwtl" ]; then
  "${bowtie_build}" \
    "${genome_fasta_dir}/${genome_fasta_no_spaces}" \
    "${bowtie_index_prefix}" \
    --threads "${threads}" \
    --quiet
fi
```

# Map reads to genome

Requires genome to be previously indexed with Bowtie.

Additionally, requires user to enter path to their `mirdeep2` directory as well as their `perl5` installation.

```{r map-read-genome, eval=FALSE, cache=TRUE, engine='bash'}
# Load bash variables into memory
source .bashvars

# Move to output directory
# miRDeep2 puts output files in current working directory
# Stop here if the directory is unavailable so output is not written elsewhere
cd "${output_dir_top}" || exit 1

# Append miRDeep2 to system PATH and set PERL5LIB
export PATH=$PATH:/home/shared/mirdeep2/bin
export PERL5LIB=$PERL5LIB:/home/shared/mirdeep2/lib/perl5

# Run miRDeep2 mapping
#   -c  input reads are in FastA format
#   -m  collapse identical reads
#   -l  discard reads shorter than this length
#   -p  Bowtie v1 index prefix of the genome
#   -s  output file for the processed (collapsed) reads
#   -t  output file for the read mappings (ARF)
#   -o  number of threads for Bowtie
time \
"${mirdeep2_mapper}" \
"${concatenated_trimmed_reads_fasta_no_spaces}" \
-c \
-m \
-l 18 \
-p "${genome_fasta_dir}/${genome_fasta_no_spaces%.*}" \
-s "${collapsed_reads_gt17bp_mirdeep2}" \
-t "${mirdeep2_mapping_file}" \
-o "${threads}"
```

# Run miRDeep2

Recommendation is to use the closest related species in the miRDeep2 options, even if the species isn't very closely related. The documentation indicates that miRDeep2 is always more accurate when at least a species is provided.
The options provided to the command are as follows:

- `none`: Known miRNAs of the species being analyzed.

- `cnidarian_mirna_fasta_name`: Related species miRNAs.

- `none`: Known miRNA precursors in this species.

- `-P`: Specifies miRBase version > 18.

- `-v`: Remove temporary files after completion.

- `-g -1`: Number of precursors to analyze. A setting of `-1` will analyze all. Default is 50,000. I set this to `-1` after multiple attempts to run using the default kept failing.

NOTE: This will take an _extremely_ long time to run (days). Could possibly be shortened by excluding `randfold` analysis.

```{r mirdeep2, eval=FALSE, cache=TRUE, engine='bash'}
# Bring the shared settings into this chunk's shell
source .bashvars

# miRDeep2 writes its output into the current working directory,
# so run it from the output directory
cd "${output_dir_top}"

# Make the miRDeep2 scripts and their Perl modules discoverable
export PATH="${PATH}:/home/shared/mirdeep2/bin"
export PERL5LIB="${PERL5LIB}:/home/shared/mirdeep2/lib/perl5"

# Positional inputs, named for readability
collapsed_reads="${collapsed_reads_gt17bp_mirdeep2}"
genome_fasta="${genome_fasta_dir}/${genome_fasta_no_spaces}"
read_mappings="${mirdeep2_mapping_file}"
related_species_mirnas="${deep_dive_data_dir}/${cnidarian_mirna_fasta_name}"

# Argument order: reads, genome, mappings, known miRNAs of this species
# (none), related-species miRNAs, known precursors of this species (none).
# stderr of miRDeep2 is captured in the report log.
time "${mirdeep2}" \
  "${collapsed_reads}" \
  "${genome_fasta}" \
  "${read_mappings}" \
  none \
  "${related_species_mirnas}" \
  none \
  -P \
  -v \
  -g -1 \
  2> miRDeep2-report.log
```

## Check runtime

```{r runtime-mirdeep2, eval=TRUE, engine='bash'}
# Bring the shared settings into this chunk's shell
source .bashvars

# The last lines of the report log are shown to check the runtime
tail -n 6 "${output_dir_top}/miRDeep2-report.log"
```

# Results

REMEMBER - You'll need to set the name of your results file before running the chunks below.

## Peep output file format

The formatting of this CSV is terrible. It has a 3-column table on top of a 17-column table. This makes parsing a bit of a pain in its raw format.

```{r check-results-file, eval=TRUE, engine='bash'}
# Bring the shared settings into this chunk's shell
source .bashvars

# Results file name is set by hand (miRDeep2 embeds a timestamp in it)
result_csv="${output_dir_top}/result_16_05_2024_t_11_15_00.csv"

head -n 30 "${result_csv}"
```

## Create more easily parsable results file

This will match the line beginning with `provisional id` and print to the end of the file (represented by the `$p`.
`$` = end, `p` = print)

```{r parsable-results-file, eval=TRUE, engine='bash'}
# Load bash variables into memory
source .bashvars

# Keep everything from the `provisional id` header line to the end of the file
sed --quiet '/provisional id/,$p' "${output_dir_top}/result_16_05_2024_t_11_15_00.csv" \
> "${output_dir_top}/parsable-result_16_05_2024_t_11_15_00.csv"

head "${output_dir_top}/parsable-result_16_05_2024_t_11_15_00.csv"
```

## Read in output CSV

This chunk provides a more concise overview of the data and its columns.

```{r read-in-result-csv, eval=TRUE}
# The "CSV" written by miRDeep2 is tab-delimited, hence sep = "\t"
mirdeep_result.df <- read.csv(
  "../output/11.2-Apul-sRNAseq-miRdeep2-31bp-fastp-merged-cnidarian_only/parsable-result_16_05_2024_t_11_15_00.csv",
  header = TRUE,
  sep = "\t"
)

str(mirdeep_result.df)
```

## miRNAs count data

This provides some rudimentary numbers for the miRDeep2 output.

Further analysis is possibly desired to evaluate score thresholds, miRNA families, etc.

```{r results-counts, eval=TRUE, engine='bash'}
# Load bash variables into memory
source .bashvars

# Parsable results file produced above: one header line, then result rows
parsable_csv="${output_dir_top}/parsable-result_16_05_2024_t_11_15_00.csv"

# In every count below, `NR > 1` skips the header row of the file itself.
# The header's 11th field is a column title (neither "-" nor empty), so
# without this it would be counted as a match to a known miRNA; skipping a
# line *after* filtering would instead drop a real novel miRNA row.
# NOTE(review): any blank or non-data lines after the header are still
# counted, as before - confirm the file holds only the header plus miRNA rows.

# Total predicted miRNAS
total_miRNAs=$(awk 'NR > 1' "${parsable_csv}" \
| wc -l
)

echo "Total of predicted miRNAs: ${total_miRNAs}"
echo ""

# Matches to known mature miRNAs
# Column 11 holds a value other than "-" when a seed match was reported
mature_miRNAs=$(awk -F'\t' 'NR > 1 && $11 != "-" && $11 != "" {print $11}' "${parsable_csv}" \
| wc -l
)

echo "Number of seed matches to known miRNAS: ${mature_miRNAs}"
echo ""

# Novel miRNAs
# Column 11 is "-" (or empty) when no seed match was reported
novel_miRNAs=$(awk -F'\t' 'NR > 1 && ($11 == "-" || $11 == "") {print $11}' "${parsable_csv}" \
| wc -l
)

echo "Number of novel miRNAs: ${novel_miRNAs}"
```

------------------------------------------------------------------------

# Citations