---
title: "14.00-heatwave-genetics-trimmed-bwa-alignments"
author: "Sam White"
date: "2025-07-11"
output: 
  github_document:
    toc: true
    number_sections: true
  bookdown::html_document2:
    theme: cosmo
    toc: true
    toc_float: true
    number_sections: true
    code_folding: show
    code_download: true
  html_document:
    theme: cosmo
    toc: true
    toc_float: true
    number_sections: true
    code_folding: show
    code_download: true
bibliography: references.bib
---

# BACKGROUND

This Rmd file will perform the following:

1. Create a BWA genome index using [BWA](https://github.com/lh3/bwa) [@li2013], if needed.

2. Align trimmed FastQs (trimming previously performed by Laura Spencer) to the *G. macrocephalus* genome ([NCBI GCF_031168955.1_ASM3116895v1](https://www.ncbi.nlm.nih.gov/datasets/genome/GCF_031168955.1/)) using [BWA](https://github.com/lh3/bwa) [@li2013].

3. Use [picard](https://github.com/broadinstitute/picard) [@Picard2019toolkit] to remove duplicates.

4. Use [bamUtil](https://github.com/statgen/bamUtil) [@jun2015] to clip overlapping regions of read pairs.

5. Use [samtools](https://www.htslib.org/doc/samtools-depth.html) [@danecek2021] to compute depth.

# INPUTS

- Genome FastA

- BWA genome FastA index

# OUTPUTS

- Deduplicated, sorted, clipped BAM files

- samtools depth file

- BAM index files

- samtools stats file for BWA alignments

- samtools flagstat file for BWA alignments

NOTE: Due to the size of the output files, they are not included in the repository. Instead, you can find them here:

- [https://gannet.fish.washington.edu/gitrepos/project-cod-temperature/output/14.00-heatwave-genetics-trimmed-bwa-alignments/](https://gannet.fish.washington.edu/gitrepos/project-cod-temperature/output/14.00-heatwave-genetics-trimmed-bwa-alignments/)

```{r setup, include=FALSE}
library(knitr)
library(reticulate)
knitr::opts_chunk$set(
  echo = TRUE,         # Display code chunks
  eval = FALSE,        # Evaluate code chunks
  warning = FALSE,     # Hide warnings
  message = FALSE,     # Hide messages
  comment = ""         # Prevents appending '##' to beginning of lines in code output
)
```

# VARIABLES

```{r variables, eval=TRUE}
# DIRECTORIES
top_output_dir <- file.path("..", "output")

output_dir <- file.path(top_output_dir, "14.00-heatwave-genetics-trimmed-bwa-alignments")

data_dir <- file.path("..", "data")

trimmed_reads_dir <- file.path(data_dir, "trimmed-fastqs-heatwave-genetics")

# PROGRAMS
programs_dir <- file.path("", "home", "shared")

bamUtil_dir <- file.path(programs_dir, "bamUtil-1.0.15", "bin")

bwa <- file.path(programs_dir, "bwa-0.7.17", "bwa")

conda_env_name <- c("base")

conda_path <- c("/home/sam/programs/mambaforge/condabin/conda")

mean_depths <- c("/home/shared/16TB_HDD_01/sam/gitrepos/WGSfqs-to-genolikelihoods/mean_cov_ind.py")

multiqc <- file.path("", "home", "sam", "programs", "mambaforge", "bin", "multiqc")

picard_dir <- file.path(programs_dir, "picard-3.4.0-6-g80ca98e2a", "build", "libs")

picard <- file.path(picard_dir, "picard.jar")

R1_suffix <- "_trimmed_clipped_R1_paired.fq.gz"

R2_suffix <- "_trimmed_clipped_R2_paired.fq.gz"

samtools_dir <- file.path(programs_dir, "samtools-1.12")

samtools <- file.path(samtools_dir, "samtools")

# FILES
genome_fasta <- file.path(data_dir, "GCF_031168955.1_ASM3116895v1_genomic.fna")

bwa_index <- file.path(data_dir, "bwa-GCF_031168955.1_ASM3116895v1_genomic")

# THREADS
threads <- "40"

# FORMATTING
line <- "-----------------------------------------------"

# Export these as environment variables for bash chunks.
Sys.setenv(
  bamUtil_dir = bamUtil_dir,
  bwa = bwa,
  bwa_index = bwa_index,
  data_dir = data_dir,
  genome_fasta = genome_fasta,
  line = line,
  mean_depths = mean_depths,
  multiqc = multiqc,
  output_dir = output_dir,
  picard_dir = picard_dir,
  picard = picard,
  top_output_dir = top_output_dir,
  R1_suffix = R1_suffix,
  R2_suffix = R2_suffix,
  samtools = samtools,
  threads = threads,
  trimmed_reads_dir = trimmed_reads_dir
)
```

# Load Conda environment

This is needed to have access to Python 3. If this is successful, the first line of output should show the Python binary being used. E.g.:

`python: /home/sam/programs/mambaforge/envs/mirmachine_env/bin/python`

```{r load-conda-env, eval=TRUE}
use_condaenv(condaenv = conda_env_name, conda = conda_path)

# Check successful env loading
py_config()
```

# ALIGNMENTS

## Create BWA Index

```{r bwa-index, engine='bash'}
# Check if BWA index file exists
# (bwa_index and genome_fasta already include data_dir in their paths)
if [[ ! -f "${bwa_index}.bwt" ]]; then
  echo "BWA index file not found. Creating BWA index..."
  ${bwa} index \
  -p ${bwa_index} \
  ${genome_fasta}
else
  echo "BWA index file already exists. Skipping index creation."
fi
```

## Check FastQs

```{r, check-fastqs, engine='bash', eval=TRUE}
ls -1 "${trimmed_reads_dir}/"*.fq.gz | wc -l
```

## Align with BWA

This step will produce sorted BAM files. Alignment is done with the `-M` option, which does the following:

> Mark shorter split hits as secondary (for Picard compatibility).

`samtools view` is run with the `-F 4` flag, which excludes unmapped reads. See [https://broadinstitute.github.io/picard/explain-flags.html](https://broadinstitute.github.io/picard/explain-flags.html) and [https://www.htslib.org/doc/samtools-view.html](https://www.htslib.org/doc/samtools-view.html) for more explanation.
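As a quick illustration of what `-F 4` filters out, `samtools flags` converts between numeric SAM flag values and their symbolic names. This chunk is a sketch for reference only (it is not part of the alignment pipeline itself); it assumes the `samtools` path exported above.

```{r, decode-sam-flags, engine='bash'}
# For reference only: decode SAM flag 4 to confirm it corresponds to the
# UNMAP bit, i.e. `-F 4` excludes reads that did not map.
${samtools} flags 4

# The conversion also works in reverse, from flag name to numeric value.
${samtools} flags UNMAP
```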
```{r, bwa-alignment, engine='bash'}
# Make output directory, if it doesn't exist
mkdir --parents ${output_dir}

# Declare arrays
R1_array=()
R2_array=()

# Populate arrays
R1_array=("${trimmed_reads_dir}"/*"${R1_suffix}")
R2_array=("${trimmed_reads_dir}"/*"${R2_suffix}")

# Run BWA
time \
for fastq in "${!R1_array[@]}"
do
  # Remove path from FastQ
  sample_no_path=${R1_array[fastq]##*/}

  # Capture just sample name (string before first `_`)
  sample=${sample_no_path%%_*}

  echo "${R1_array[fastq]}"
  echo "${sample}"

  # Run BWA
  ${bwa} mem \
  -M \
  -t ${threads} \
  ${bwa_index} \
  ${R1_array[fastq]} \
  ${R2_array[fastq]} \
  2> ${output_dir}/${sample}-bwa.stderr \
  | ${samtools} view -@ 20 -bS -F 4 \
  | ${samtools} sort -@ 20 -o ${output_dir}/${sample}.sorted.bam
done
```

# MARK DUPLICATES

## Set read groups

Downstream analysis with Picard requires BAM files to have read groups, but these were not set earlier.

```{r, set-read-groups, engine='bash'}
for bam in ${output_dir}/*.sorted.bam
do
  # Remove path from BAM
  sample_no_path=${bam##*/}

  # Capture just sample name (string before first `.`)
  sample=${sample_no_path%%.*}

  # Check read group
  echo ""
  echo "Read group for ${bam} before samtools:"
  ${samtools} view -H ${bam} | grep '@RG'
  echo ""

  ${samtools} addreplacerg \
  -r "@RG\tID:RG1\tSM:${sample}" \
  -o ${output_dir}/${sample}-RG.sorted.bam \
  --threads ${threads} \
  ${bam}

  echo "Read group for ${output_dir}/${sample}-RG.sorted.bam:"
  ${samtools} view -H ${output_dir}/${sample}-RG.sorted.bam | grep '@RG'
  echo ""
done
```

## Picard MarkDuplicates

Use Picard to mark and remove duplicates.

```{r, picard-dupes, engine='bash'}
for bam in ${output_dir}/*RG.sorted.bam
do
  # Remove path from BAM
  sample_no_path=${bam##*/}

  # Capture just sample name (string before first `.`)
  sample=${sample_no_path%%.*}

  java -jar ${picard} \
  MarkDuplicates \
  I=${bam} \
  O=${output_dir}/${sample}.sorted.dedup.bam \
  M=${output_dir}/${sample}-picard_dups.log \
  VALIDATION_STRINGENCY=SILENT \
  REMOVE_DUPLICATES=true \
  2> ${output_dir}/${sample}-picard_dups.stderr
done
```
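As an optional spot check (not part of the original pipeline), the metrics files written by `MarkDuplicates` via `M=` can be skimmed for per-sample duplication rates. This sketch assumes the standard Picard metrics layout, in which the two lines after the `## METRICS CLASS` header hold the column names (including `PERCENT_DUPLICATION`) and their values.

```{r, check-dup-metrics, engine='bash'}
# Optional spot check: print the duplication metrics table from each
# Picard MarkDuplicates metrics file.
for metrics in ${output_dir}/*-picard_dups.log
do
  echo "${metrics}"
  grep -A 2 "^## METRICS" "${metrics}"
  echo "${line}"
done
```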
# CLIP OVERLAPS

```{r, clip-overlaps, engine='bash'}
for bam in ${output_dir}/*.sorted.dedup.bam
do
  # Remove path from BAM
  sample_no_path=${bam##*/}

  # Capture just sample name (string before first `.`)
  sample=${sample_no_path%%.*}

  # Run bamUtil to clip overlaps
  ${bamUtil_dir}/bam \
  clipOverlap \
  --in ${bam} \
  --out ${output_dir}/${sample}.sorted.dedup.clipped.bam \
  --stats \
  2> ${output_dir}/${sample}.sorted.dedup.clipped-bamUtil.stats
done
```

# DEPTHS

## samtools depth

```{r, samtools-depth, engine='bash'}
for bam in ${output_dir}/*.sorted.dedup.clipped.bam
do
  # Remove path from BAM
  sample_no_path=${bam##*/}

  # Capture filename without .bam (string before last `.`)
  sample=${sample_no_path%.*}

  # Run samtools to compute depth at all positions (-aa),
  # keeping only the depth column (column 3)
  ${samtools} depth \
  -aa \
  ${bam} \
  | cut -f 3 \
  | gzip \
  > ${output_dir}/${sample}.depth.gz
done
```

## Mean depths

Uses the `mean_cov_ind.py` Python script from [https://github.com/letimm/WGSfqs-to-genolikelihoods](https://github.com/letimm/WGSfqs-to-genolikelihoods).

```{bash, mean-depth-coverage, eval=TRUE}
cd ${output_dir}

for depth_file in *-RG.sorted.dedup.clipped.depth.gz
do
  ${mean_depths} \
  --infile ${depth_file} \
  --outfile mean-coverage.tab
done

head mean-coverage.tab
```

# INDEX BAMS

```{r, index-bams, engine='bash'}
cd ${output_dir}

for bam in *.sorted.dedup.clipped.bam
do
  ${samtools} index \
  -@ ${threads} \
  ${bam}
done
```

# STATS FILES

Generate `samtools stats` and `samtools flagstat` files for the BWA alignments.

```{r bam-stats, engine='bash'}
for bam in ${output_dir}/*-RG.sorted.bam
do
  # Remove path from BAM
  sample_no_path=${bam##*/}

  # Capture just sample name (string before first `.`)
  sample=${sample_no_path%%.*}

  # samtools stats
  ${samtools} stats \
  ${bam} \
  -@ ${threads} \
  > ${output_dir}/"${sample_no_path}.stats"

  # samtools flagstat
  ${samtools} flagstat \
  ${bam} \
  -@ ${threads} \
  > ${output_dir}/"${sample_no_path}.flagstat"
done
```

## MultiQC on Picard MarkDuplicates and samtools stats/flagstat files

```{r multiqc-stats, engine='bash', eval=TRUE}
cd ${output_dir}

${multiqc} --interactive .
```

# REFERENCES