Began transcriptome annotation (GitHub issue) using TransDecoder on the de novo transcriptome assembly provide by Giles Goetz (see link to issue). TransDecoder uses a combination of BLASTp hmmscan and pfam to identify the longest potential open reading frames (ORFs) from the hundreds of thousands of contigs assembled by Trinity. The job was run on Mox.

SLURM script

20231107-mmag-transdecoder-transcriptome.sh (GitHub)

#!/bin/bash
## Job Name
#SBATCH --job-name=20231107-mmag-transdecoder-transcriptome
## Allocation Definition
#SBATCH --account=srlab
#SBATCH --partition=srlab
## Resources
## Nodes
#SBATCH --nodes=1
## Walltime (days-hours:minutes:seconds format)
#SBATCH --time=05-00:00:00
## Memory per node
#SBATCH --mem=200G
##turn on e-mail notification
#SBATCH --mail-type=ALL
#SBATCH --mail-user=samwhite@uw.edu
## Specify the working directory for this job
#SBATCH --chdir=/gscratch/scrubbed/samwhite/outputs/20231107-mmag-transdecoder-transcriptome

# Transdecoder to identify ORFs DuMOAR Trinity assembly

###################################################################################
# These variables need to be set by user

## Assign Variables

threads=28


# Paths to input/output files

trinity_fasta="/gscratch/srlab/sam/data/C_magister/transcriptomes/trinity_denovo.Trinity.fasta"
trinity_gene_trans_map="/gscratch/srlab/sam/data/C_magister/transcriptomes/trinity_denovo.Trinity.fasta.gene_trans_map"

blastp_out_dir="blastp_out"
transdecoder_out_dir="${trinity_fasta##*/}.transdecoder_dir"
pfam_out_dir="pfam_out"
blastp_out="${blastp_out_dir}/blastp.outfmt6"

pfam_out="${pfam_out_dir}/pfam.domtblout"
lORFs_pep="${transdecoder_out_dir}/longest_orfs.pep"
pfam_db="/gscratch/srlab/programs/Trinotate-v3.1.1/admin/Pfam-A.hmm"
sp_db="/gscratch/srlab/programs/Trinotate-v3.1.1/admin/uniprot_sprot.pep"

# Paths to programs
blast_dir="/gscratch/srlab/programs/ncbi-blast-2.8.1+/bin"
blastp="${blast_dir}/blastp"
hmmer_dir="/gscratch/srlab/programs/hmmer-3.2.1/src"
hmmscan="${hmmer_dir}/hmmscan"
transdecoder_dir="/gscratch/srlab/programs/TransDecoder-v5.5.0"
transdecoder_lORFs="${transdecoder_dir}/TransDecoder.LongOrfs"
transdecoder_predict="${transdecoder_dir}/TransDecoder.Predict"
###################################################################################


# Load Python Mox module for Python module availability
module load intel-python3_2017

# Capture input FastA checksum
echo "Input Fasta: ${trinity_fasta}"
echo "MD5 checksum:"
echo ""
md5sum "${trinity_fasta}" | tee --append md5checksum-assembly.txt
echo ""

# Make output directories
mkdir --parents "${blastp_out_dir}"
mkdir --parents "${pfam_out_dir}"

# Extract long open reading frames
${transdecoder_lORFs} \
-t ${trinity_fasta} \
--gene_trans_map ${trinity_gene_trans_map}

# Run blastp on long ORFs
${blastp} \
-query ${lORFs_pep} \
-db ${sp_db} \
-max_target_seqs 1 \
-outfmt 6 \
-evalue 1e-5 \
-num_threads ${threads} \
> ${blastp_out}

# Run pfam search
${hmmscan} \
--cpu ${threads} \
--domtblout ${pfam_out} \
${pfam_db} \
${lORFs_pep}

# Run Transdecoder with blastp and Pfam results
${transdecoder_predict} \
-t ${trinity_fasta} \
--retain_pfam_hits ${pfam_out} \
--retain_blastp_hits ${blastp_out}

####################################################################

# Capture program options
if [[ "${#programs_array[@]}" -gt 0 ]]; then
  echo "Logging program options..."
  for program in "${!programs_array[@]}"
  do
    {
    echo "Program options for ${program}: "
    echo ""
    # Handle samtools help menus
    if [[ "${program}" == "samtools_index" ]] \
    || [[ "${program}" == "samtools_sort" ]] \
    || [[ "${program}" == "samtools_view" ]]
    then
      ${programs_array[$program]}

    # Handle DIAMOND BLAST menu
    elif [[ "${program}" == "diamond" ]]; then
      ${programs_array[$program]} help

    # Handle NCBI BLASTx menu
    elif [[ "${program}" == "blastx" ]]; then
      ${programs_array[$program]} -help

    # Handle StringTie prepDE script
    elif [[ "${program}" == "prepDE" ]]; then
      python3 ${programs_array[$program]} -h
    fi
    ${programs_array[$program]} -h
    echo ""
    echo ""
    echo "----------------------------------------------"
    echo ""
    echo ""
  } &>> program_options.log || true

    # If MultiQC is in programs_array, copy the config file to this directory.
    if [[ "${program}" == "multiqc" ]]; then
      cp --preserve ~/.multiqc_config.yaml multiqc_config.yaml
    fi
  done
  echo "Finished logging programs options."
  echo ""
fi


# Document programs in PATH (primarily for program version ID)
echo "Logging system $PATH..."
{
date
echo ""
echo "System PATH for $SLURM_JOB_ID"
echo ""
printf "%0.s-" {1..10}
echo "${PATH}" | tr : \\n
} >> system_path.log
echo "Finished logging system $PATH."

/———-

RESULTS

Runtime

Runtime was actually longer than I anticipated: ~1.25 days.

Files

Although there are other files generated by TransDecoder, such as BEDs and GFFs, the following files are those utilized downstream by Trinotate to complete the transcriptome annotation pipeline.

Output folder:

20231107-mmag-transdecoder-transcriptome/
FastA checksum (TXT):
- md5checksum-assembly.txt
BLASTp output format 6 (TXT):
- 20231107-mmag-transdecoder-transcriptome/blastp_out/blastp.outfmt6 (6.5MB)
Longest peptide ORFs (FastA):
- 20231107-mmag-transdecoder-transcriptome/trinity_denovo.Trinity.fasta.transdecoder_dir/longest_orfs.pep (57MB)
Pfam output (TXT):
- 20231107-mmag-transdecoder-transcriptome/pfam_out/pfam.domtblout (143MB)