Use miRTrace [@kang2018] to identify taxonomic origins of miRNA sequencing data.
NOTE: This requires you to have previously run 08-Pmea-sRNAseq-trimming.Rmd
, as the code relies on the trimmed reads output from that code.
Inputs:
Trimmed sRNAseq FastQs generated by 08-Pmea-sRNAseq-trimming.Rmd
*flexbar_trim.25bp*.gz
Outputs:
mirtrace.config
: A miRTrace config file. A comma-separated file with this layout (one FastQ per line): /path/to/fastq,custom_sample_name
“Collapsed” (i.e. unique sequences only) FastA for each corresponding input FastQ.
mirtrace-report.html
: HTML-formatted report generated by miRTrace.
mirtrace-stats-contamination_basic.tsv
: Tab-delimited report with counts of sequences from each collapsed FastAs having matches to known miRNAs within each of the miRTrace Clades.
mirtrace-stats-contamination_detailed.csv
: Tab-delimited report of only Clades with which sequences were matched, along with the corresponding miRNA families in each clade, and the sequence counts.
This allows usage of Bash variables across R Markdown chunks.
{
echo "#### Assign Variables ####"
echo ""
echo "# Data directories"
echo 'export deep_dive_dir=/home/shared/8TB_HDD_02/shedurkin/deep-dive'
echo 'export output_dir_top=${deep_dive_dir}/F-Pmea/output/09-Pmea-sRNAseq-miRTrace'
echo 'export trimmed_reads_dir=${deep_dive_dir}/F-Pmea/output/08-Pmea-sRNAseq-trimming/trimmed-reads'
echo ""
echo "# Paths to programs"
echo 'export mirtrace=/home/sam/programs/mambaforge/envs/miRTrace_env/bin/mirtrace'
echo ""
echo "# Set number of CPUs to use"
echo 'export threads=40'
echo ""
echo "export fastq_pattern='*flexbar_trim.25bp*.gz'"
echo "# Programs associative array"
echo "declare -A programs_array"
echo "programs_array=("
echo '[mirtrace]="${mirtrace}"'
echo ")"
} > .bashvars
cat .bashvars
#### Assign Variables ####
# Data directories
export deep_dive_dir=/home/shared/8TB_HDD_02/shedurkin/deep-dive
export output_dir_top=${deep_dive_dir}/F-Pmea/output/09-Pmea-sRNAseq-miRTrace
export trimmed_reads_dir=${deep_dive_dir}/F-Pmea/output/08-Pmea-sRNAseq-trimming/trimmed-reads
# Paths to programs
export mirtrace=/home/sam/programs/mambaforge/envs/miRTrace_env/bin/mirtrace
# Set number of CPUs to use
export threads=40
export fastq_pattern='*flexbar_trim.25bp*.gz'
# Programs associative array
declare -A programs_array
programs_array=(
[mirtrace]="${mirtrace}"
)
# Load bash variables into memory
source .bashvars
# Declare array
fastq_array=()
# Populate array
fastq_array=(${trimmed_reads_dir}/${fastq_pattern})
# Loop through read pairs
# Increment by 2 to process next pair of FastQ files
if [ -f "${output_dir_top}/mirtrace.config" ]; then
echo "mirtrace.config already exists. Nothing to do."
else
for (( i=0; i<${#fastq_array[@]} ; i+=2 ))
do
# Use first three parts of filename to create short sample name
R1_name=$(echo "${fastq_array[i]##*/}" | awk -F "-" '{print $1"-"$2"-"$3}')
R2_name=$(echo "${fastq_array[i+1]##*/}" | awk -F "-" '{print $1"-"$2"-"$3}')
echo "${fastq_array[i]},${R1_name}_1"
echo "${fastq_array[i+1]},${R2_name}_2"
done >> "${output_dir_top}/mirtrace.config"
fi
cat "${output_dir_top}/mirtrace.config"
mirtrace.config already exists. Nothing to do.
/home/shared/8TB_HDD_02/shedurkin/deep-dive/F-Pmea/output/08-Pmea-sRNAseq-trimming/trimmed-reads/sRNA-POC-47-S1-TP2.flexbar_trim.25bp_1.fastq.gz,sRNA-POC-47_1
/home/shared/8TB_HDD_02/shedurkin/deep-dive/F-Pmea/output/08-Pmea-sRNAseq-trimming/trimmed-reads/sRNA-POC-47-S1-TP2.flexbar_trim.25bp_2.fastq.gz,sRNA-POC-47_2
/home/shared/8TB_HDD_02/shedurkin/deep-dive/F-Pmea/output/08-Pmea-sRNAseq-trimming/trimmed-reads/sRNA-POC-48-S1-TP2.flexbar_trim.25bp_1.fastq.gz,sRNA-POC-48_1
/home/shared/8TB_HDD_02/shedurkin/deep-dive/F-Pmea/output/08-Pmea-sRNAseq-trimming/trimmed-reads/sRNA-POC-48-S1-TP2.flexbar_trim.25bp_2.fastq.gz,sRNA-POC-48_2
/home/shared/8TB_HDD_02/shedurkin/deep-dive/F-Pmea/output/08-Pmea-sRNAseq-trimming/trimmed-reads/sRNA-POC-50-S1-TP2.flexbar_trim.25bp_1.fastq.gz,sRNA-POC-50_1
/home/shared/8TB_HDD_02/shedurkin/deep-dive/F-Pmea/output/08-Pmea-sRNAseq-trimming/trimmed-reads/sRNA-POC-50-S1-TP2.flexbar_trim.25bp_2.fastq.gz,sRNA-POC-50_2
/home/shared/8TB_HDD_02/shedurkin/deep-dive/F-Pmea/output/08-Pmea-sRNAseq-trimming/trimmed-reads/sRNA-POC-53-S1-TP2.flexbar_trim.25bp_1.fastq.gz,sRNA-POC-53_1
/home/shared/8TB_HDD_02/shedurkin/deep-dive/F-Pmea/output/08-Pmea-sRNAseq-trimming/trimmed-reads/sRNA-POC-53-S1-TP2.flexbar_trim.25bp_2.fastq.gz,sRNA-POC-53_2
/home/shared/8TB_HDD_02/shedurkin/deep-dive/F-Pmea/output/08-Pmea-sRNAseq-trimming/trimmed-reads/sRNA-POC-57-S1-TP2.flexbar_trim.25bp_1.fastq.gz,sRNA-POC-57_1
/home/shared/8TB_HDD_02/shedurkin/deep-dive/F-Pmea/output/08-Pmea-sRNAseq-trimming/trimmed-reads/sRNA-POC-57-S1-TP2.flexbar_trim.25bp_2.fastq.gz,sRNA-POC-57_2
# Load bash variables into memory
source .bashvars
time \
${programs_array[mirtrace]} trace \
--config ${output_dir_top}/mirtrace.config \
--write-fasta \
--num-threads ${threads} \
--output-dir ${output_dir_top} \
--force
tree -h ${output_dir_top}
miRTrace version 1.0.1 starting. Processing 10 sample(s).
NOTE: reusing existing output directory, outdated files may be present.
/home/shared/8TB_HDD_02/shedurkin/deep-dive/F-Pmea/output/09-Pmea-sRNAseq-miRTrace/qc_passed_reads.all.collapsed/sRNA-POC-50-S1-TP2.flexbar_trim.25bp_1.fasta (Permission denied)
I/O Error, aborting.
real 0m28.418s
user 4m19.668s
sys 0m7.212s
/home/shared/8TB_HDD_02/shedurkin/deep-dive/F-Pmea/output/09-Pmea-sRNAseq-miRTrace
├── [1.6K] mirtrace.config
├── [302K] mirtrace-report.html
├── [ 563] mirtrace-stats-contamination_basic.tsv
├── [1.1K] mirtrace-stats-contamination_detailed.tsv
└── [4.0K] qc_passed_reads.all.collapsed
├── [ 71M] sRNA-POC-47-S1-TP2.flexbar_trim.25bp_1.fasta
├── [ 89M] sRNA-POC-47-S1-TP2.flexbar_trim.25bp_2.fasta
├── [ 75M] sRNA-POC-48-S1-TP2.flexbar_trim.25bp_1.fasta
├── [ 84M] sRNA-POC-48-S1-TP2.flexbar_trim.25bp_2.fasta
├── [ 68M] sRNA-POC-50-S1-TP2.flexbar_trim.25bp_1.fasta
├── [ 83M] sRNA-POC-50-S1-TP2.flexbar_trim.25bp_2.fasta
├── [ 99M] sRNA-POC-53-S1-TP2.flexbar_trim.25bp_1.fasta
├── [116M] sRNA-POC-53-S1-TP2.flexbar_trim.25bp_2.fasta
├── [120M] sRNA-POC-57-S1-TP2.flexbar_trim.25bp_1.fasta
└── [129M] sRNA-POC-57-S1-TP2.flexbar_trim.25bp_2.fasta
1 directory, 14 files
mirtrace.detailed.df <- read.csv("../output/09-Pmea-sRNAseq-miRTrace/mirtrace-stats-contamination_detailed.tsv", sep = "\t", header = TRUE)
str(mirtrace.detailed.df)
'data.frame': 8 obs. of 14 variables:
$ CLADE : chr "nematode" "insects" "insects" "lophotrochozoa" ...
$ FAMILY_ID : int 86 14 957 1996 2689 1994 1998 3122
$ MIRBASE_IDS : chr "asu-miR-86,bma-miR-86,cbn-miR-86,cbr-miR-86,cel-miR-86,crm-miR-86,hco-miR-86,prd-miR-86,str-miR-86" "aae-miR-14,aga-miR-14,ame-miR-14,api-miR-14,bmo-miR-14,cqu-miR-14,dan-miR-14,der-miR-14,dgr-miR-14,dme-miR-14,d"| __truncated__ "aae-miR-957,aga-miR-957,cqu-miR-957,dme-miR-957,dps-miR-957,dsi-miR-957,dvi-miR-957" "cte-miR-1996a" ...
$ SEQ : chr "TAAGTGAATGCTTTGCCACA" "TCAGTCTTTTTCTCTCTCCT" "TGAAACCGTCCAAAACTGAG" "CTCAAGTGAGGTCAGTGCTG" ...
$ sRNA.POC.47_1: int 0 0 0 0 2 0 1 0
$ sRNA.POC.47_2: int 0 0 0 0 0 0 0 0
$ sRNA.POC.48_1: int 0 2 0 0 0 0 0 0
$ sRNA.POC.48_2: int 0 0 0 0 0 0 0 0
$ sRNA.POC.50_1: int 0 0 0 0 0 0 0 0
$ sRNA.POC.50_2: int 0 0 0 0 0 0 0 0
$ sRNA.POC.53_1: int 0 0 1 1 0 0 0 1
$ sRNA.POC.53_2: int 0 0 0 0 0 0 0 0
$ sRNA.POC.57_1: int 3 2 0 0 0 1 0 0
$ sRNA.POC.57_2: int 0 0 0 0 0 0 0 0
# Select columns corresponding to sample names
sample_columns <- mirtrace.detailed.df %>%
select(starts_with("sRNA.POC."))
# Calculate the sum for each column
sample_sums <- colSums(sample_columns)
# Count the number of columns with a sum greater than 0
samples_with_sum_gt_0 <- sum(sample_sums > 0)
paste("Number of samples with matches: ", samples_with_sum_gt_0)
[1] "Number of samples with matches: 4"
# Total number of samples (columns)
total_samples <- ncol(sample_columns)
# Percentage of samples with sums greater than 0
percentage_samples_gt_0 <- (samples_with_sum_gt_0 / total_samples) * 100
paste("Percentage of samples with matches: ", percentage_samples_gt_0)
[1] "Percentage of samples with matches: 40"
unique_clade_count <- mirtrace.detailed.df %>%
distinct(CLADE) %>% # Get unique entries in CLADE column
count() # Count the number of unique entries
paste("Number of clades with matches:", unique_clade_count)
[1] "Number of clades with matches: 4"
To make them easier to see, counts > 0 are highlighted in green.
mirtrace.detailed.df %>%
mutate(
across(
starts_with("sRNA"),
~cell_spec(
.,
background = ifelse(
. > 0,
"lightgreen",
"white"
)
)
)
) %>%
kable(escape = F, caption = "Clades identified as having sRNAseq matches.") %>%
kable_styling("striped") %>%
scroll_box(width = "100%", height = "500px")
CLADE | FAMILY_ID | MIRBASE_IDS | SEQ | sRNA.POC.47_1 | sRNA.POC.47_2 | sRNA.POC.48_1 | sRNA.POC.48_2 | sRNA.POC.50_1 | sRNA.POC.50_2 | sRNA.POC.53_1 | sRNA.POC.53_2 | sRNA.POC.57_1 | sRNA.POC.57_2 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
nematode | 86 | asu-miR-86,bma-miR-86,cbn-miR-86,cbr-miR-86,cel-miR-86,crm-miR-86,hco-miR-86,prd-miR-86,str-miR-86 | TAAGTGAATGCTTTGCCACA | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0 |
insects | 14 | aae-miR-14,aga-miR-14,ame-miR-14,api-miR-14,bmo-miR-14,cqu-miR-14,dan-miR-14,der-miR-14,dgr-miR-14,dme-miR-14,dmo-miR-14,dpe-miR-14,dps-miR-14,dse-miR-14,dsi-miR-14,dvi-miR-14,dwi-miR-14,dya-miR-14,hme-miR-14,mse-miR-14,ngi-miR-14,nvi-miR-14,tca-miR-14 | TCAGTCTTTTTCTCTCTCCT | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 2 | 0 |
insects | 957 | aae-miR-957,aga-miR-957,cqu-miR-957,dme-miR-957,dps-miR-957,dsi-miR-957,dvi-miR-957 | TGAAACCGTCCAAAACTGAG | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
lophotrochozoa | 1996 | cte-miR-1996a | CTCAAGTGAGGTCAGTGCTG | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
lophotrochozoa | 2689 | cte-miR-2689 | TATCCTGGCCTGCAAGTGCA | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
lophotrochozoa | 1994 | cla-miR-1994,cte-miR-1994,lgi-miR-1994a,lgi-miR-1994b | TGAGACAGTGTGTCCTCCCT | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
lophotrochozoa | 1998 | cte-miR-1998 | TTGAACGCAGAGATGTACAT | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
primates | 3122 | hsa-miR-3122 | GTTGGGACAAGAGGACGGTC | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |