Code for trimming and QCing RNAseq data, to be used on Acropora pulchra.
For now I’m just going to QC the raw reads and the trimmed reads generated as a part of the deep-dive project. If additional/different trimming needs to be done for this expression work, it will be performed here.
Inputs:
*.fastq.gz
)Outputs:
FastQC
HTML reports for raw and trimmed reads.
Sequencing report Trimming details
This allows usage of Bash variables (e.g. paths to common directories) across R Markdown chunks.
{
echo "#### Assign Variables ####"
echo ""
echo "# Data directories"
echo 'export expression_dir=/home/shared/8TB_HDD_02/shedurkin/expression'
echo 'export output_dir_top=${expression_dir}/D-Apul/output/01-Apul-RNA-trimming-FastQC'
echo 'export raw_fastqc_dir=${output_dir_top}/raw-fastqc'
echo 'export raw_reads_dir=${expression_dir}/D-Apul/data/01-Apul-RNA-trimming-FastQC/raw-reads'
echo 'export raw_reads_url="https://owl.fish.washington.edu/nightingales/A_pulchra/30-789513166/"'
echo 'export trimmed_fastqc_dir=${output_dir_top}/trimmed-fastqc'
echo 'export trimmed_reads_dir=${expression_dir}/D-Apul/data/01-Apul-RNA-trimming-FastQC/trimmed-reads'
echo 'export trimmed_reads_url="https://gannet.fish.washington.edu/Atumefaciens/20230519-E5_coral-fastqc-fastp-multiqc-RNAseq/A_pulchra/trimmed/"'
echo ""
echo "# Paths to programs"
echo 'export fastqc=/home/shared/FastQC-0.12.1/fastqc'
echo 'export multiqc=/home/sam/programs/mambaforge/bin/multiqc'
echo 'export flexbar=/home/shared/flexbar-3.5.0-linux/flexbar'
echo ""
echo "# Set FastQ filename patterns"
echo "export fastq_pattern='*.fastq.gz'"
echo "export R1_fastq_pattern='*_R1_*.fastq.gz'"
echo "export R2_fastq_pattern='*_R2_*.fastq.gz'"
echo ""
echo "# Set number of CPUs to use"
echo 'export threads=20'
echo ""
echo "# Input/output files"
echo 'export raw_checksums=checksums.md5'
echo 'export trimmed_checksums=trimmed_fastq_checksums.md5'
echo ""
echo "## Inititalize arrays"
echo 'export fastq_array_R1=()'
echo 'export fastq_array_R2=()'
echo 'export raw_fastqs_array=()'
echo 'export R1_names_array=()'
echo 'export R2_names_array=()'
echo 'export trimmed_fastqs_array=()'
echo ""
echo "# Programs associative array"
echo "declare -A programs_array"
echo "programs_array=("
echo '[fastqc]="${fastqc}" \'
echo '[multiqc]="${multiqc}" \'
echo '[flexbar]="${flexbar}"'
echo ")"
} > .bashvars
cat .bashvars
#### Assign Variables ####
# Data directories
export expression_dir=/home/shared/8TB_HDD_02/shedurkin/expression
export output_dir_top=${expression_dir}/D-Apul/output/01-Apul-RNA-trimming-FastQC
export raw_fastqc_dir=${output_dir_top}/raw-fastqc
export raw_reads_dir=${expression_dir}/D-Apul/data/01-Apul-RNA-trimming-FastQC/raw-reads
export raw_reads_url="https://owl.fish.washington.edu/nightingales/A_pulchra/30-789513166/"
export trimmed_fastqc_dir=${output_dir_top}/trimmed-fastqc
export trimmed_reads_dir=${expression_dir}/D-Apul/data/01-Apul-RNA-trimming-FastQC/trimmed-reads
export trimmed_reads_url="https://gannet.fish.washington.edu/Atumefaciens/20230519-E5_coral-fastqc-fastp-multiqc-RNAseq/A_pulchra/trimmed/"
# Paths to programs
export fastqc=/home/shared/FastQC-0.12.1/fastqc
export multiqc=/home/sam/programs/mambaforge/bin/multiqc
export flexbar=/home/shared/flexbar-3.5.0-linux/flexbar
# Set FastQ filename patterns
export fastq_pattern='*.fastq.gz'
export R1_fastq_pattern='*_R1_*.fastq.gz'
export R2_fastq_pattern='*_R2_*.fastq.gz'
# Set number of CPUs to use
export threads=20
# Input/output files
export raw_checksums=checksums.md5
export trimmed_checksums=trimmed_fastq_checksums.md5
## Inititalize arrays
export fastq_array_R1=()
export fastq_array_R2=()
export raw_fastqs_array=()
export R1_names_array=()
export R2_names_array=()
export trimmed_fastqs_array=()
# Programs associative array
declare -A programs_array
programs_array=(
[fastqc]="${fastqc}" \
[multiqc]="${multiqc}" \
[flexbar]="${flexbar}"
)
Reads are downloaded from: https://owl.fish.washington.edu/nightingales/A_pulchra/30-789513166/
The --cut-dirs 3
command cuts the preceding directory structure (i.e. nightingales/A_pulchra/30-789513166/
) so that we just end up with
the reads.
# Load bash variables into memory
source .bashvars
wget \
--directory-prefix ${raw_reads_dir} \
--recursive \
--no-check-certificate \
--continue \
--cut-dirs 3 \
--no-host-directories \
--no-parent \
--quiet \
--accept ${fastq_pattern} ${raw_reads_url}
# Load bash variables into memory
source .bashvars
ls -lh "${raw_reads_dir}"
total 66G
-rw-r--r-- 1 shedurkin labmembers 3.5G May 16 2023 ACR-140-S1-TP2_R1_001.fastq.gz
-rw-r--r-- 1 shedurkin labmembers 3.6G May 16 2023 ACR-140-S1-TP2_R2_001.fastq.gz
-rw-r--r-- 1 shedurkin labmembers 3.1G May 16 2023 ACR-145-S1-TP2_R1_001.fastq.gz
-rw-r--r-- 1 shedurkin labmembers 3.2G May 16 2023 ACR-145-S1-TP2_R2_001.fastq.gz
-rw-r--r-- 1 shedurkin labmembers 3.2G May 16 2023 ACR-150-S1-TP2_R1_001.fastq.gz
-rw-r--r-- 1 shedurkin labmembers 3.3G May 16 2023 ACR-150-S1-TP2_R2_001.fastq.gz
-rw-r--r-- 1 shedurkin labmembers 3.5G May 16 2023 ACR-173-S1-TP2_R1_001.fastq.gz
-rw-r--r-- 1 shedurkin labmembers 3.6G May 16 2023 ACR-173-S1-TP2_R2_001.fastq.gz
-rw-r--r-- 1 shedurkin labmembers 3.1G May 16 2023 ACR-178-S1-TP2_R1_001.fastq.gz
-rw-r--r-- 1 shedurkin labmembers 3.2G May 16 2023 ACR-178-S1-TP2_R2_001.fastq.gz
-rw-r--r-- 1 shedurkin labmembers 1.4K May 17 2023 checksums.md5
-rw-r--r-- 1 shedurkin labmembers 3.5G May 17 2023 RNA-ACR-140-S1-TP2_R1_001.fastq.gz
-rw-r--r-- 1 shedurkin labmembers 3.6G May 17 2023 RNA-ACR-140-S1-TP2_R2_001.fastq.gz
-rw-r--r-- 1 shedurkin labmembers 3.1G May 17 2023 RNA-ACR-145-S1-TP2_R1_001.fastq.gz
-rw-r--r-- 1 shedurkin labmembers 3.2G May 17 2023 RNA-ACR-145-S1-TP2_R2_001.fastq.gz
-rw-r--r-- 1 shedurkin labmembers 3.2G May 17 2023 RNA-ACR-150-S1-TP2_R1_001.fastq.gz
-rw-r--r-- 1 shedurkin labmembers 3.3G May 17 2023 RNA-ACR-150-S1-TP2_R2_001.fastq.gz
-rw-r--r-- 1 shedurkin labmembers 3.5G May 17 2023 RNA-ACR-173-S1-TP2_R1_001.fastq.gz
-rw-r--r-- 1 shedurkin labmembers 3.6G May 17 2023 RNA-ACR-173-S1-TP2_R2_001.fastq.gz
-rw-r--r-- 1 shedurkin labmembers 3.1G May 17 2023 RNA-ACR-178-S1-TP2_R1_001.fastq.gz
-rw-r--r-- 1 shedurkin labmembers 3.2G May 17 2023 RNA-ACR-178-S1-TP2_R2_001.fastq.gz
# Load bash variables into memory
source .bashvars
wget \
--directory-prefix ${raw_reads_dir} \
--recursive \
--no-check-certificate \
--continue \
--cut-dirs 3 \
--no-host-directories \
--no-parent \
--quiet \
--accept checksums.md5 ${raw_reads_url}
cd "${raw_reads_dir}"
md5sum checksums.md5 --check
# Load bash variables into memory
source .bashvars
############ RUN FASTQC ############
# Create array of raw FastQs
raw_fastqs_array=(${raw_reads_dir}/${fastq_pattern})
# Pass array contents to new variable as space-delimited list
raw_fastqc_list=$(echo "${raw_fastqs_array[*]}")
echo "Beginning FastQC on raw reads..."
echo ""
# Run FastQC
### NOTE: Do NOT quote raw_fastqc_list
${programs_array[fastqc]} \
--threads ${threads} \
--outdir ${raw_fastqc_dir} \
--quiet \
${raw_fastqc_list}
echo "FastQC on raw reads complete!"
echo ""
############ END FASTQC ############
############ RUN MULTIQC ############
echo "Beginning MultiQC on raw FastQC..."
echo ""
${programs_array[multiqc]} ${raw_fastqc_dir} -o ${raw_fastqc_dir}
echo ""
echo "MultiQC on raw FastQs complete."
echo ""
############ END MULTIQC ############
echo "Removing FastQC zip files."
echo ""
rm ${raw_fastqc_dir}/*.zip
echo "FastQC zip files removed."
echo ""
# Load bash variables into memory
source .bashvars
# View directory contents
ls -lh ${raw_fastqc_dir}
total 13M
-rw-r--r-- 1 shedurkin labmembers 593K Aug 19 15:17 ACR-140-S1-TP2_R1_001_fastqc.html
-rw-r--r-- 1 shedurkin labmembers 593K Aug 19 15:17 ACR-140-S1-TP2_R2_001_fastqc.html
-rw-r--r-- 1 shedurkin labmembers 595K Aug 19 15:16 ACR-145-S1-TP2_R1_001_fastqc.html
-rw-r--r-- 1 shedurkin labmembers 594K Aug 19 15:16 ACR-145-S1-TP2_R2_001_fastqc.html
-rw-r--r-- 1 shedurkin labmembers 597K Aug 19 15:16 ACR-150-S1-TP2_R1_001_fastqc.html
-rw-r--r-- 1 shedurkin labmembers 597K Aug 19 15:16 ACR-150-S1-TP2_R2_001_fastqc.html
-rw-r--r-- 1 shedurkin labmembers 597K Aug 19 15:17 ACR-173-S1-TP2_R1_001_fastqc.html
-rw-r--r-- 1 shedurkin labmembers 596K Aug 19 15:17 ACR-173-S1-TP2_R2_001_fastqc.html
-rw-r--r-- 1 shedurkin labmembers 592K Aug 19 15:16 ACR-178-S1-TP2_R1_001_fastqc.html
-rw-r--r-- 1 shedurkin labmembers 598K Aug 19 15:16 ACR-178-S1-TP2_R2_001_fastqc.html
drwxr-xr-x 2 shedurkin labmembers 4.0K Aug 19 15:17 multiqc_data
-rw-r--r-- 1 shedurkin labmembers 1.4M Aug 19 15:17 multiqc_report.html
-rw-r--r-- 1 shedurkin labmembers 593K Aug 19 15:17 RNA-ACR-140-S1-TP2_R1_001_fastqc.html
-rw-r--r-- 1 shedurkin labmembers 593K Aug 19 15:17 RNA-ACR-140-S1-TP2_R2_001_fastqc.html
-rw-r--r-- 1 shedurkin labmembers 595K Aug 19 15:16 RNA-ACR-145-S1-TP2_R1_001_fastqc.html
-rw-r--r-- 1 shedurkin labmembers 594K Aug 19 15:16 RNA-ACR-145-S1-TP2_R2_001_fastqc.html
-rw-r--r-- 1 shedurkin labmembers 597K Aug 19 15:16 RNA-ACR-150-S1-TP2_R1_001_fastqc.html
-rw-r--r-- 1 shedurkin labmembers 597K Aug 19 15:17 RNA-ACR-150-S1-TP2_R2_001_fastqc.html
-rw-r--r-- 1 shedurkin labmembers 597K Aug 19 15:17 RNA-ACR-173-S1-TP2_R1_001_fastqc.html
-rw-r--r-- 1 shedurkin labmembers 596K Aug 19 15:17 RNA-ACR-173-S1-TP2_R2_001_fastqc.html
-rw-r--r-- 1 shedurkin labmembers 592K Aug 19 15:16 RNA-ACR-178-S1-TP2_R1_001_fastqc.html
-rw-r--r-- 1 shedurkin labmembers 598K Aug 19 15:16 RNA-ACR-178-S1-TP2_R2_001_fastqc.html
Reads are downloaded from: https://gannet.fish.washington.edu/Atumefaciens/20230519-E5_coral-fastqc-fastp-multiqc-RNAseq/A_pulchra/trimmed/
# Load bash variables into memory
source .bashvars
wget \
--directory-prefix ${trimmed_reads_dir} \
--recursive \
--no-check-certificate \
--continue \
--cut-dirs 4 \
--no-host-directories \
--no-parent \
--quiet \
--accept ${fastq_pattern} ${trimmed_reads_url}
# Load bash variables into memory
source .bashvars
ls -lh "${trimmed_reads_dir}"
total 27G
-rw-r--r-- 1 shedurkin labmembers 2.9G May 19 2023 RNA-ACR-140-S1-TP2_R1_001.fastp-trim.20230519.fastq.gz
-rw-r--r-- 1 shedurkin labmembers 3.0G May 19 2023 RNA-ACR-140-S1-TP2_R2_001.fastp-trim.20230519.fastq.gz
-rw-r--r-- 1 shedurkin labmembers 2.6G May 19 2023 RNA-ACR-145-S1-TP2_R1_001.fastp-trim.20230519.fastq.gz
-rw-r--r-- 1 shedurkin labmembers 2.6G May 19 2023 RNA-ACR-145-S1-TP2_R2_001.fastp-trim.20230519.fastq.gz
-rw-r--r-- 1 shedurkin labmembers 2.6G May 19 2023 RNA-ACR-150-S1-TP2_R1_001.fastp-trim.20230519.fastq.gz
-rw-r--r-- 1 shedurkin labmembers 2.7G May 19 2023 RNA-ACR-150-S1-TP2_R2_001.fastp-trim.20230519.fastq.gz
-rw-r--r-- 1 shedurkin labmembers 2.9G May 19 2023 RNA-ACR-173-S1-TP2_R1_001.fastp-trim.20230519.fastq.gz
-rw-r--r-- 1 shedurkin labmembers 3.0G May 19 2023 RNA-ACR-173-S1-TP2_R2_001.fastp-trim.20230519.fastq.gz
-rw-r--r-- 1 shedurkin labmembers 2.6G May 19 2023 RNA-ACR-178-S1-TP2_R1_001.fastp-trim.20230519.fastq.gz
-rw-r--r-- 1 shedurkin labmembers 2.7G May 19 2023 RNA-ACR-178-S1-TP2_R2_001.fastp-trim.20230519.fastq.gz
-rw-r--r-- 1 shedurkin labmembers 890 May 19 2023 trimmed_fastq_checksums.md5
# Load bash variables into memory
source .bashvars
wget \
--directory-prefix ${trimmed_reads_dir} \
--recursive \
--no-check-certificate \
--continue \
--cut-dirs 4 \
--no-host-directories \
--no-parent \
--quiet \
--accept trimmed_fastq_checksums.md5 ${trimmed_reads_url}
cd "${trimmed_reads_dir}"
md5sum trimmed_fastq_checksums.md5 --check
# Load bash variables into memory
source .bashvars
############ RUN FASTQC ############
### NOTE: Do NOT quote raw_fastqc_list
# Create array of trimmed FastQs
trimmed_fastqs_array=(${trimmed_reads_dir}/${fastq_pattern})
# Pass array contents to new variable as space-delimited list
trimmed_fastqc_list=$(echo "${trimmed_fastqs_array[*]}")
echo "Beginning FastQC on raw reads..."
echo ""
# Run FastQC
${programs_array[fastqc]} \
--threads ${threads} \
--outdir ${trimmed_fastqc_dir} \
--quiet \
${trimmed_fastqc_list}
echo "FastQC on trimmed reads complete!"
echo ""
############ END FASTQC ############
############ RUN MULTIQC ############
echo "Beginning MultiQC on raw FastQC..."
echo ""
${programs_array[multiqc]} ${trimmed_fastqc_dir} -o ${trimmed_fastqc_dir}
echo ""
echo "MultiQC on trimmed FastQs complete."
echo ""
############ END MULTIQC ############
echo "Removing FastQC zip files."
echo ""
rm ${trimmed_fastqc_dir}/*.zip
echo "FastQC zip files removed."
echo ""
# Load bash variables into memory
source .bashvars
# View directory contents
ls -lh ${trimmed_fastqc_dir}
total 7.9M
drwxr-xr-x 2 shedurkin labmembers 4.0K Aug 19 16:50 multiqc_data
-rw-r--r-- 1 shedurkin labmembers 1.3M Aug 19 16:50 multiqc_report.html
-rw-r--r-- 1 shedurkin labmembers 666K Aug 19 16:50 RNA-ACR-140-S1-TP2_R1_001.fastp-trim.20230519_fastqc.html
-rw-r--r-- 1 shedurkin labmembers 671K Aug 19 16:50 RNA-ACR-140-S1-TP2_R2_001.fastp-trim.20230519_fastqc.html
-rw-r--r-- 1 shedurkin labmembers 671K Aug 19 16:50 RNA-ACR-145-S1-TP2_R1_001.fastp-trim.20230519_fastqc.html
-rw-r--r-- 1 shedurkin labmembers 668K Aug 19 16:50 RNA-ACR-145-S1-TP2_R2_001.fastp-trim.20230519_fastqc.html
-rw-r--r-- 1 shedurkin labmembers 676K Aug 19 16:50 RNA-ACR-150-S1-TP2_R1_001.fastp-trim.20230519_fastqc.html
-rw-r--r-- 1 shedurkin labmembers 674K Aug 19 16:50 RNA-ACR-150-S1-TP2_R2_001.fastp-trim.20230519_fastqc.html
-rw-r--r-- 1 shedurkin labmembers 671K Aug 19 16:50 RNA-ACR-173-S1-TP2_R1_001.fastp-trim.20230519_fastqc.html
-rw-r--r-- 1 shedurkin labmembers 669K Aug 19 16:50 RNA-ACR-173-S1-TP2_R2_001.fastp-trim.20230519_fastqc.html
-rw-r--r-- 1 shedurkin labmembers 669K Aug 19 16:50 RNA-ACR-178-S1-TP2_R1_001.fastp-trim.20230519_fastqc.html
-rw-r--r-- 1 shedurkin labmembers 675K Aug 19 16:50 RNA-ACR-178-S1-TP2_R2_001.fastp-trim.20230519_fastqc.html