This Rmd file downloads raw RNA-seq FastQs for A.pulchra and evaluates them using FastQC and MultiQC (Ewels et al. 2016).

1 Create a Bash variables file

This allows usage of Bash variables across R Markdown chunks.

# Write the shared settings file (.bashvars) in the current directory.
# Later chunks load it with `source .bashvars`.
#
# The here-doc delimiter is quoted, so nothing inside is expanded here:
# every ${...} reference and each trailing backslash is written literally
# and is only evaluated when .bashvars is sourced.
cat > .bashvars <<'BASHVARS'
#### Assign Variables ####

# Data directories
export timeseries_dir=/home/shared/8TB_HDD_02/shedurkin/timeseries_molecular
export output_dir_top=${timeseries_dir}/D-Apul/output/00.00-D-Apul-RNAseq-reads-FastQC-MultiQC
export raw_fastqc_dir=${output_dir_top}/raw-fastqc
export raw_reads_dir=${timeseries_dir}/D-Apul/data/raw-fastqs
export raw_reads_url="https://owl.fish.washington.edu/nightingales/E5-coral-time-series/30-1047560508/"

# Paths to programs
export fastqc=/home/shared/FastQC-0.12.1/fastqc
export multiqc=/home/sam/programs/mambaforge/bin/multiqc

# Set FastQ filename patterns
export fastq_pattern='*.fastq.gz'
export R1_fastq_pattern='*_R1_*.fastq.gz'
export R2_fastq_pattern='*_R2_*.fastq.gz'

# Set number of CPUs to use
export threads=40

## Inititalize arrays
export fastq_array_R1=()
export fastq_array_R2=()
export raw_fastqs_array=()
export R1_names_array=()
export R2_names_array=()

# Programs associative array
declare -A programs_array
programs_array=(
[fastqc]="${fastqc}" \
[multiqc]="${multiqc}" \
)

# Print formatting
export line="--------------------------------------------------------"

BASHVARS

# Show the generated file so its contents appear in the rendered document
cat .bashvars
#### Assign Variables ####

# Data directories
export timeseries_dir=/home/shared/8TB_HDD_02/shedurkin/timeseries_molecular
export output_dir_top=${timeseries_dir}/D-Apul/output/00.00-D-Apul-RNAseq-reads-FastQC-MultiQC
export raw_fastqc_dir=${output_dir_top}/raw-fastqc
export raw_reads_dir=${timeseries_dir}/D-Apul/data/raw-fastqs
export raw_reads_url="https://owl.fish.washington.edu/nightingales/E5-coral-time-series/30-1047560508/"

# Paths to programs
export fastqc=/home/shared/FastQC-0.12.1/fastqc
export multiqc=/home/sam/programs/mambaforge/bin/multiqc

# Set FastQ filename patterns
export fastq_pattern='*.fastq.gz'
export R1_fastq_pattern='*_R1_*.fastq.gz'
export R2_fastq_pattern='*_R2_*.fastq.gz'

# Set number of CPUs to use
export threads=40

## Inititalize arrays
export fastq_array_R1=()
export fastq_array_R2=()
export raw_fastqs_array=()
export R1_names_array=()
export R2_names_array=()

# Programs associative array
declare -A programs_array
programs_array=(
[fastqc]="${fastqc}" \
[multiqc]="${multiqc}" \
)

# Print formatting
export line="--------------------------------------------------------"

2 Download A.pulchra RNA-seq FastQs

2.1 Download raw RNA-seq reads

Reads are downloaded from https://owl.fish.washington.edu/nightingales/E5-coral-time-series/30-1047560508/

Because the sequencing run included multiple species, the code also filters the sample list so that only the A.pulchra samples are downloaded.

The --cut-dirs 3 option removes the three leading directory components (i.e. nightingales/E5-coral-time-series/30-1047560508/) from the saved paths, so that we just end up with the reads.

# Download the A.pulchra FastQs (and their MD5 files) from ${raw_reads_url}
# into ${raw_reads_dir}. All settings come from .bashvars.

# Load bash variables into memory
source .bashvars

# Metadata table used to select the samples to download
metadata_csv="${timeseries_dir}/M-multi-species/data/rna_metadata.csv"

# Make output directory if it doesn't exist
mkdir --parents "${raw_reads_dir}"

# Create list of only A.pulchra sample names:
# print column 5 of every row whose column 6 starts with "ACR"
sample_list=$(awk -F "," '$6 ~ /^ACR/ {print $5}' "${metadata_csv}")

# An empty list (missing metadata file or no matching rows) would turn the
# accept pattern below into "*_*", which matches nearly every file on the
# server, so stop here instead of downloading everything.
if [[ -z "${sample_list}" ]]; then
  echo "ERROR: no sample names found in ${metadata_csv}" >&2
  exit 1
fi

echo ""
echo "${line}"
echo ""
echo "Sample list:"
echo ""
echo "${sample_list}"
echo ""
echo "${line}"
echo ""


# Use printf to format each item for use in wget.
# ${sample_list} is intentionally unquoted: word splitting hands printf one
# sample name per argument, and printf repeats the format for each of them.
# shellcheck disable=SC2086
formatted_list=$(printf "*%s_*," ${sample_list})

# Remove the trailing comma
formatted_list=${formatted_list%,}

# Output the final wget command
echo ""
echo "${line}"
echo ""
echo "Formatted wget accept list:"
echo ""
echo "wget --accept=\"$formatted_list\""
echo ""
echo "${line}"
echo ""

# Run wget to retrieve FastQs and MD5 files
# Note: the --no-clobber option will skip re-downloading any files that are already present in the output directory
# The accept list is quoted so the shell passes its "*" wildcards to wget
# untouched instead of attempting pathname expansion on them.
# --quiet hides wget's own messages, so report a non-zero exit status explicitly.
wget \
--directory-prefix "${raw_reads_dir}" \
--recursive \
--no-check-certificate \
--continue \
--cut-dirs 3 \
--no-host-directories \
--no-parent \
--quiet \
--no-clobber \
--accept="${formatted_list}" "${raw_reads_url}" \
|| echo "WARNING: wget exited with status $?; some files may be missing" >&2

ls -lh "${raw_reads_dir}"

2.2 Verify raw read checksums

# Verify every downloaded .md5 checksum file in the raw reads directory.
# Exits non-zero if the directory is unavailable, if no .md5 files are
# present, or if any checksum file fails verification.

# Load bash variables into memory
source .bashvars

# Run from inside the reads directory so that relative filenames listed in
# the checksum files resolve. Abort rather than verify in the wrong place.
cd "${raw_reads_dir:?raw_reads_dir is not set}" || exit 1

# Collect the checksum files. nullglob leaves the array empty when nothing
# matches, instead of holding the literal string "*.md5".
shopt -s nullglob
md5_files=(*.md5)
shopt -u nullglob

if (( ${#md5_files[@]} == 0 )); then
  echo "ERROR: no .md5 files found in ${raw_reads_dir}" >&2
  exit 1
fi

# Check each file on its own and keep going after a failure so that every
# problem is reported in a single run.
failed=0
for file in "${md5_files[@]}"
do
  md5sum --check "${file}" || failed=$((failed + 1))
done

if (( failed > 0 )); then
  echo "ERROR: ${failed} checksum file(s) failed verification" >&2
  exit 1
fi

3 FastQC/MultiQC on raw reads

# Run FastQC on every raw FastQ in ${raw_reads_dir}, summarise the reports
# with MultiQC, then delete the FastQC zip archives.
# All outputs are written to ${raw_fastqc_dir}.

# Load bash variables into memory
source .bashvars

# Make output directory if it doesn't exist
mkdir --parents "${raw_fastqc_dir}"

############ RUN FASTQC ############


# Create array of raw FastQs
# NOTE: ${fastq_pattern} must stay unquoted so the glob is expanded.
# nullglob leaves the array empty when nothing matches, rather than holding
# the literal pattern.
shopt -s nullglob
raw_fastqs_array=("${raw_reads_dir}"/${fastq_pattern})
shopt -u nullglob

if (( ${#raw_fastqs_array[@]} == 0 )); then
  echo "ERROR: no files matching ${fastq_pattern} found in ${raw_reads_dir}" >&2
  exit 1
fi

echo "Beginning FastQC on raw reads..."
echo ""

# Run FastQC
# The array is expanded with "${...[@]}" so each file is passed as exactly
# one argument, even if a path contains whitespace.
if ! "${programs_array[fastqc]}" \
--threads "${threads}" \
--outdir "${raw_fastqc_dir}" \
--quiet \
"${raw_fastqs_array[@]}"
then
  echo "ERROR: FastQC failed" >&2
  exit 1
fi

echo "FastQC on raw reads complete!"
echo ""

############ END FASTQC ############

############ RUN MULTIQC ############
echo "Beginning MultiQC on raw FastQC..."
echo ""

if ! "${programs_array[multiqc]}" "${raw_fastqc_dir}" -o "${raw_fastqc_dir}"
then
  echo "ERROR: MultiQC failed" >&2
  exit 1
fi

echo ""
echo "MultiQC on raw FastQs complete."
echo ""

############ END MULTIQC ############

# The zip archives are removed only after MultiQC has run on the directory
echo "Removing FastQC zip files."
echo ""
rm -- "${raw_fastqc_dir}"/*.zip
echo "FastQC zip files removed."
echo ""
# Load bash variables into memory
source .bashvars

# View directory contents
# Quoted so the path is passed to ls as a single argument
ls -lh -- "${raw_fastqc_dir}"
Ewels, Philip, Måns Magnusson, Sverker Lundin, and Max Käller. 2016. “MultiQC: Summarize Analysis Results for Multiple Tools and Samples in a Single Report.” Bioinformatics 32 (19): 3047–48. https://doi.org/10.1093/bioinformatics/btw354.
LS0tCnRpdGxlOiAiMDAuMDAtRC1BcHVsLVJOQXNlcS1yZWFkcy1GYXN0UUMtTXVsdGlRQy5SbWQiCmF1dGhvcjogIlNhbSBXaGl0ZSIKZGF0ZTogIjIwMjQtMTAtMDQiCm91dHB1dDogCiAgYm9va2Rvd246Omh0bWxfZG9jdW1lbnQyOgogICAgdGhlbWU6IGNvc21vCiAgICB0b2M6IHRydWUKICAgIHRvY19mbG9hdDogdHJ1ZQogICAgbnVtYmVyX3NlY3Rpb25zOiB0cnVlCiAgICBjb2RlX2ZvbGRpbmc6IHNob3cKICAgIGNvZGVfZG93bmxvYWQ6IHRydWUKICBnaXRodWJfZG9jdW1lbnQ6CiAgICB0b2M6IHRydWUKICAgIG51bWJlcl9zZWN0aW9uczogdHJ1ZQogIGh0bWxfZG9jdW1lbnQ6CiAgICB0aGVtZTogY29zbW8KICAgIHRvYzogdHJ1ZQogICAgdG9jX2Zsb2F0OiB0cnVlCiAgICBudW1iZXJfc2VjdGlvbnM6IHRydWUKICAgIGNvZGVfZm9sZGluZzogc2hvdwogICAgY29kZV9kb3dubG9hZDogdHJ1ZQpiaWJsaW9ncmFwaHk6IHJlZmVyZW5jZXMuYmliCi0tLQoKYGBge3Igc2V0dXAsIGluY2x1ZGU9RkFMU0V9CmxpYnJhcnkoa25pdHIpCmtuaXRyOjpvcHRzX2NodW5rJHNldCgKICBlY2hvID0gVFJVRSwgICAgICAgICAjIERpc3BsYXkgY29kZSBjaHVua3MKICBldmFsID0gRkFMU0UsICAgICAgICAjIEV2YWx1YXRlIGNvZGUgY2h1bmtzCiAgd2FybmluZyA9IEZBTFNFLCAgICAgIyBIaWRlIHdhcm5pbmdzCiAgbWVzc2FnZSA9IEZBTFNFLCAgICAgIyBIaWRlIG1lc3NhZ2VzCiAgY29tbWVudCA9ICIiICAgICAgICAgIyBQcmV2ZW50cyBhcHBlbmRpbmcgJyMjJyB0byBiZWdpbm5pbmcgb2YgbGluZXMgaW4gY29kZSBvdXRwdXQKKQpgYGAKClRoaXMgUm1kIGZpbGUgd2lsbCBkb3dubG9hZCByYXcgUk5BLXNlcSBGYXN0UXMgZm9yICpBLnB1bGNocmEqIGFuZCBldmFsdWF0ZSB0aGVtIHVzaW5nIFtGYXN0UUNdKGh0dHBzOi8vZ2l0aHViLmNvbS9zLWFuZHJld3MvRmFzdFFDKSBhbmQgW011bHRpUUNdKGh0dHBzOi8vbXVsdGlxYy5pbmZvLylbQGV3ZWxzMjAxNl0uCgojIENyZWF0ZSBhIEJhc2ggdmFyaWFibGVzIGZpbGUKClRoaXMgYWxsb3dzIHVzYWdlIG9mIEJhc2ggdmFyaWFibGVzIGFjcm9zcyBSIE1hcmtkb3duIGNodW5rcy4KCmBgYHtyIHNhdmUtYmFzaC12YXJpYWJsZXMtdG8tcnZhcnMtZmlsZSwgZW5naW5lPSdiYXNoJywgZXZhbD1UUlVFfQp7CmVjaG8gIiMjIyMgQXNzaWduIFZhcmlhYmxlcyAjIyMjIgplY2hvICIiCgplY2hvICIjIERhdGEgZGlyZWN0b3JpZXMiCmVjaG8gJ2V4cG9ydCB0aW1lc2VyaWVzX2Rpcj0vaG9tZS9zaGFyZWQvOFRCX0hERF8wMi9zaGVkdXJraW4vdGltZXNlcmllc19tb2xlY3VsYXInCmVjaG8gJ2V4cG9ydCBvdXRwdXRfZGlyX3RvcD0ke3RpbWVzZXJpZXNfZGlyfS9ELUFwdWwvb3V0cHV0LzAwLjAwLUQtQXB1bC1STkFzZXEtcmVhZHMtRmFzdFFDLU11bHRpUUMnCmVjaG8gJ2V4cG9ydCByYXdfZmFzdHFjX2Rpcj0ke291dHB1dF9kaXJfdG9wfS9yYXctZmFzdHFjJwplY2hvICdleHBvcnQgcmF3X3Jl
YWRzX2Rpcj0ke3RpbWVzZXJpZXNfZGlyfS9ELUFwdWwvZGF0YS9yYXctZmFzdHFzJwplY2hvICdleHBvcnQgcmF3X3JlYWRzX3VybD0iaHR0cHM6Ly9vd2wuZmlzaC53YXNoaW5ndG9uLmVkdS9uaWdodGluZ2FsZXMvRTUtY29yYWwtdGltZS1zZXJpZXMvMzAtMTA0NzU2MDUwOC8iJwplY2hvICIiCgplY2hvICIjIFBhdGhzIHRvIHByb2dyYW1zIgplY2hvICdleHBvcnQgZmFzdHFjPS9ob21lL3NoYXJlZC9GYXN0UUMtMC4xMi4xL2Zhc3RxYycKZWNobyAnZXhwb3J0IG11bHRpcWM9L2hvbWUvc2FtL3Byb2dyYW1zL21hbWJhZm9yZ2UvYmluL211bHRpcWMnCmVjaG8gIiIKCmVjaG8gIiMgU2V0IEZhc3RRIGZpbGVuYW1lIHBhdHRlcm5zIgplY2hvICJleHBvcnQgZmFzdHFfcGF0dGVybj0nKi5mYXN0cS5neiciCmVjaG8gImV4cG9ydCBSMV9mYXN0cV9wYXR0ZXJuPScqX1IxXyouZmFzdHEuZ3onIgplY2hvICJleHBvcnQgUjJfZmFzdHFfcGF0dGVybj0nKl9SMl8qLmZhc3RxLmd6JyIKZWNobyAiIgoKZWNobyAiIyBTZXQgbnVtYmVyIG9mIENQVXMgdG8gdXNlIgplY2hvICdleHBvcnQgdGhyZWFkcz00MCcKZWNobyAiIgoKCmVjaG8gIiMjIEluaXRpdGFsaXplIGFycmF5cyIKZWNobyAnZXhwb3J0IGZhc3RxX2FycmF5X1IxPSgpJwplY2hvICdleHBvcnQgZmFzdHFfYXJyYXlfUjI9KCknCmVjaG8gJ2V4cG9ydCByYXdfZmFzdHFzX2FycmF5PSgpJwplY2hvICdleHBvcnQgUjFfbmFtZXNfYXJyYXk9KCknCmVjaG8gJ2V4cG9ydCBSMl9uYW1lc19hcnJheT0oKScKZWNobyAiIgoKZWNobyAiIyBQcm9ncmFtcyBhc3NvY2lhdGl2ZSBhcnJheSIKZWNobyAiZGVjbGFyZSAtQSBwcm9ncmFtc19hcnJheSIKZWNobyAicHJvZ3JhbXNfYXJyYXk9KCIKZWNobyAnW2Zhc3RxY109IiR7ZmFzdHFjfSIgXCcKZWNobyAnW211bHRpcWNdPSIke211bHRpcWN9IiBcJwplY2hvICIpIgplY2hvICIiCgplY2hvICIjIFByaW50IGZvcm1hdHRpbmciCmVjaG8gJ2V4cG9ydCBsaW5lPSItLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLSInCmVjaG8gIiIKfSA+IC5iYXNodmFycwoKY2F0IC5iYXNodmFycwpgYGAKCiMgRG93bmxvYWQgKkEucHVsY2hyYSogUk5BLXNlcSBGYXN0UXMKCiMjIERvd25sb2FkIHJhdyBSTkEtc2VxIHJlYWRzCgpSZWFkcyBhcmUgZG93bmxvYWRlZCBmcm9tIDxodHRwczovL293bC5maXNoLndhc2hpbmd0b24uZWR1L25pZ2h0aW5nYWxlcy9FNS1jb3JhbC10aW1lLXNlcmllcy8zMC0xMDQ3NTYwNTA4Lz4KClNpbmNlIHNlcXVlbmNpbmcgaW5jbHVkZWQgbXVsdGlwbGUgc3BlY2llcywgdGhlIGNvZGUgd2lsbCBhbHNvIHBhcnNlIG9ubHkgdGhvc2UgdGhhdCBhcmUgKkEucHVsY2hyYSouCgpUaGUgYC0tY3V0LWRpcnMgM2AgY29tbWFuZCBjdXRzIHRoZSBwcmVjZWRpbmcgZGlyZWN0b3J5IHN0cnVjdHVyZSAoaS5lLiBgbmlnaHRpbmdhbGVzL0U1LWNvcmFsLXRpbWUtc2VyaWVzLzMwLTEwNDc1NjA1
MDgvYCkgc28gdGhhdCB3ZSBqdXN0IGVuZCB1cCB3aXRoIHRoZSByZWFkcy4KCmBgYHtiYXNoIGRvd25sb2FkLXJhdy1yZWFkcywgZW5naW5lPSdiYXNoJ30KIyBMb2FkIGJhc2ggdmFyaWFibGVzIGludG8gbWVtb3J5CnNvdXJjZSAuYmFzaHZhcnMKCiMgTWFrZSBvdXRwdXQgZGlyZWN0b3J5IGlmIGl0IGRvZXNuJ3QgZXhpc3QKbWtkaXIgLS1wYXJlbnRzICR7cmF3X3JlYWRzX2Rpcn0KCiMgQ3JlYXRlIGxpc3Qgb2Ygb25seSBBLnB1bGNocmEgc2FtcGxlIG5hbWVzCnNhbXBsZV9saXN0PSQoYXdrIC1GICIsIiAnJDYgfiAvXkFDUi8ge3ByaW50ICQ1fScgJHt0aW1lc2VyaWVzX2Rpcn0vTS1tdWx0aS1zcGVjaWVzL2RhdGEvcm5hX21ldGFkYXRhLmNzdikKCmVjaG8gIiIKZWNobyAiJHtsaW5lfSIKZWNobyAiIgplY2hvICJTYW1wbGUgbGlzdDoiCmVjaG8gIiIKZWNobyAiJHtzYW1wbGVfbGlzdH0iCmVjaG8gIiIKZWNobyAiJHtsaW5lfSIKZWNobyAiIgoKCiMgVXNlIHByaW50ZiB0byBmb3JtYXQgZWFjaCBpdGVtIGZvciB1c2UgaW4gd2dldApmb3JtYXR0ZWRfbGlzdD0kKHByaW50ZiAiKiVzXyosIiAke3NhbXBsZV9saXN0fSkKCiMgUmVtb3ZlIHRoZSB0cmFpbGluZyBjb21tYQpmb3JtYXR0ZWRfbGlzdD0ke2Zvcm1hdHRlZF9saXN0JSx9CgojIE91dHB1dCB0aGUgZmluYWwgd2dldCBjb21tYW5kCmVjaG8gIiIKZWNobyAiJHtsaW5lfSIKZWNobyAiIgplY2hvICJGb3JtYXR0ZWQgd2dldCBhY2NlcHQgbGlzdDoiCmVjaG8gIiIKZWNobyAid2dldCAtLWFjY2VwdD1cIiRmb3JtYXR0ZWRfbGlzdFwiIgplY2hvICIiCmVjaG8gIiR7bGluZX0iCmVjaG8gIiIKCiMgUnVuIHdnZXQgdG8gcmV0cmlldmUgRmFzdFFzIGFuZCBNRDUgZmlsZXMKIyBOb3RlOiB0aGUgLS1uby1jbG9iYmVyIGNvbW1hbmQgd2lsbCBza2lwIHJlLWRvd25sb2FkaW5nIGFueSBmaWxlcyB0aGF0IGFyZSBhbHJlYWR5IHByZXNlbnQgaW4gdGhlIG91dHB1dCBkaXJlY3RvcnkKd2dldCBcCi0tZGlyZWN0b3J5LXByZWZpeCAke3Jhd19yZWFkc19kaXJ9IFwKLS1yZWN1cnNpdmUgXAotLW5vLWNoZWNrLWNlcnRpZmljYXRlIFwKLS1jb250aW51ZSBcCi0tY3V0LWRpcnMgMyBcCi0tbm8taG9zdC1kaXJlY3RvcmllcyBcCi0tbm8tcGFyZW50IFwKLS1xdWlldCBcCi0tbm8tY2xvYmJlciBcCi0tYWNjZXB0PSR7Zm9ybWF0dGVkX2xpc3R9ICR7cmF3X3JlYWRzX3VybH0KCmxzIC1saCAiJHtyYXdfcmVhZHNfZGlyfSIKYGBgCgojIyBWZXJpZnkgcmF3IHJlYWQgY2hlY2tzdW1zCgpgYGB7YmFzaCB2ZXJpZnktcmF3LXJlYWQtY2hlY2tzdW1zLCBlbmdpbmU9J2Jhc2gnfQojIExvYWQgYmFzaCB2YXJpYWJsZXMgaW50byBtZW1vcnkKc291cmNlIC5iYXNodmFycwoKY2QgIiR7cmF3X3JlYWRzX2Rpcn0iCgojIENoZWNrc3VtcyBmaWxlIGNvbnRhaW5zIG90aGVyIGZpbGVzLCBzbyB0aGlzIGp1c3QgbG9va3MgZm9yIHRoZSBSTkFzZXEgZmlsZXMuCmZvciBmaWxlIGluICoubWQ1CmRvCiAg
bWQ1c3VtIC0tY2hlY2sgIiR7ZmlsZX0iCmRvbmUKYGBgCgojIEZhc3RRQy9NdWx0aVFDIG9uIHJhdyByZWFkcwoKYGBge2Jhc2ggcmF3LWZhc3RxYy1tdWx0aXFjLCBlbmdpbmU9J2Jhc2gnfQojIExvYWQgYmFzaCB2YXJpYWJsZXMgaW50byBtZW1vcnkKc291cmNlIC5iYXNodmFycwoKIyBNYWtlIG91dHB1dCBkaXJlY3RvcnkgaWYgaXQgZG9lc24ndCBleGlzdApta2RpciAtLXBhcmVudHMgIiR7cmF3X2Zhc3RxY19kaXJ9IgoKIyMjIyMjIyMjIyMjIFJVTiBGQVNUUUMgIyMjIyMjIyMjIyMjCgoKIyBDcmVhdGUgYXJyYXkgb2YgdHJpbW1lZCBGYXN0UXMKcmF3X2Zhc3Rxc19hcnJheT0oJHtyYXdfcmVhZHNfZGlyfS8ke2Zhc3RxX3BhdHRlcm59KQoKIyBQYXNzIGFycmF5IGNvbnRlbnRzIHRvIG5ldyB2YXJpYWJsZSBhcyBzcGFjZS1kZWxpbWl0ZWQgbGlzdApyYXdfZmFzdHFjX2xpc3Q9JChlY2hvICIke3Jhd19mYXN0cXNfYXJyYXlbKl19IikKCmVjaG8gIkJlZ2lubmluZyBGYXN0UUMgb24gcmF3IHJlYWRzLi4uIgplY2hvICIiCgojIFJ1biBGYXN0UUMKIyMjIE5PVEU6IERvIE5PVCBxdW90ZSByYXdfZmFzdHFjX2xpc3QKJHtwcm9ncmFtc19hcnJheVtmYXN0cWNdfSBcCi0tdGhyZWFkcyAke3RocmVhZHN9IFwKLS1vdXRkaXIgJHtyYXdfZmFzdHFjX2Rpcn0gXAotLXF1aWV0IFwKJHtyYXdfZmFzdHFjX2xpc3R9CgplY2hvICJGYXN0UUMgb24gcmF3IHJlYWRzIGNvbXBsZXRlISIKZWNobyAiIgoKIyMjIyMjIyMjIyMjIEVORCBGQVNUUUMgIyMjIyMjIyMjIyMjCgojIyMjIyMjIyMjIyMgUlVOIE1VTFRJUUMgIyMjIyMjIyMjIyMjCmVjaG8gIkJlZ2lubmluZyBNdWx0aVFDIG9uIHJhdyBGYXN0UUMuLi4iCmVjaG8gIiIKCiR7cHJvZ3JhbXNfYXJyYXlbbXVsdGlxY119ICR7cmF3X2Zhc3RxY19kaXJ9IC1vICR7cmF3X2Zhc3RxY19kaXJ9CgplY2hvICIiCmVjaG8gIk11bHRpUUMgb24gcmF3IEZhc3RRcyBjb21wbGV0ZS4iCmVjaG8gIiIKCiMjIyMjIyMjIyMjIyBFTkQgTVVMVElRQyAjIyMjIyMjIyMjIyMKCmVjaG8gIlJlbW92aW5nIEZhc3RRQyB6aXAgZmlsZXMuIgplY2hvICIiCnJtICR7cmF3X2Zhc3RxY19kaXJ9LyouemlwCmVjaG8gIkZhc3RRQyB6aXAgZmlsZXMgcmVtb3ZlZC4iCmVjaG8gIiIKYGBgCgpgYGB7YmFzaH0KIyBMb2FkIGJhc2ggdmFyaWFibGVzIGludG8gbWVtb3J5CnNvdXJjZSAuYmFzaHZhcnMKCiMgVmlldyBkaXJlY3RvcnkgY29udGVudHMKbHMgLWxoICR7cmF3X2Zhc3RxY19kaXJ9CgpgYGAKCgo=