--- title: "02-trim-primers" output: html_document date: "2023-04-25" --- This workflow is adapted from the following pipeline: https://benjjneb.github.io/dada2/ITS_workflow.html # Get started install r packages ```{r} install.packages('ShortRead') install.packages("BiocManager") BiocManager::install("dada2") install.packages('ShortRead') ``` load packages ```{r} library(BiocManager) library(ShortRead) library("dada2") ``` To get cutadapt, I downloaded the miniconda installer using curl install cutadapt software ```{bash} #download miniconda from url cd ../software curl -O https://repo.anaconda.com/miniconda/Miniconda3-py310_23.3.1-0-MacOSX-arm64.sh ``` and then moved to the directory it was in by typing `cd software` in the terminal. I then typed bash Miniconda3-py310_23.3.1-0-MacOSX-arm64.sh in the terminal to start the setup process. Conda is located in: /Users/hailaschultz/miniconda3. I checked that installation worked by typing `conda list` in the terminal. In the terminal I then typed: `conda config --add channels defaults conda config --add channels bioconda conda config --add channels conda-forge conda config --set channel_priority strict` and then `conda create -n cutadaptenv cutadapt` On my personal computer, I had to use `CONDA_SUBDIR=osx-64 conda create -n cutadaptenv cutadapt` I ran conda init bash To activate the conda environment, in the terminal I typed `conda activate cutadaptenv` # Remove Primers download fastqc ```{bash} curl -O https://www.bioinformatics.babraham.ac.uk/projects/fastqc/fastqc_v0.11.7.zip ``` use fastqc ```{bash} cd /Users/hailaschultz/GitHub/haila-ZoopMetabarcoding/data/fastq-files /Users/hailaschultz/GitHub/haila-ZoopMetabarcoding/software/FastQC/fastqc *fastq.gz -o /Users/hailaschultz/GitHub/haila-ZoopMetabarcoding/output/raw-read-qc ``` make multiqc report in the terminal I ran `conda install multiqc` to install multiqc ```{bash} cd /Users/hailaschultz/GitHub/haila-ZoopMetabarcoding/output/raw-read-qc multiqc . ``` take a look at data ```{bash} zcat 2018APRP12Ve-COI-48samples_S145_L001_R1_001.fastq.gz | head -n 2 ``` Make lists of forward and reverse read files list files for forward and reverse reads ```{r} #set file path path <- "../data/fastq-files" #make matched list of forward and reverse reads fnFs <- sort(list.files(path, pattern = "_R1_001.fastq.gz", full.names = TRUE)) fnRs <- sort(list.files(path, pattern = "_R2_001.fastq.gz", full.names = TRUE)) #specify primer sequences FWD <- "GGWACWGGWTGAACWGTWTAYCCYCC" REV <- "TANACYTCNGGRTGNCCRAARAAYCA" #in the reverse primer, I replaced I with N ``` Check if primers are actually in data get orientations of primers ```{r} allOrients <- function(primer) { # Create all orientations of the input sequence require(Biostrings) dna <- DNAString(primer) # The Biostrings works w/ DNAString objects rather than character vectors orients <- c(Forward = dna, Complement = complement(dna), Reverse = reverse(dna), RevComp = reverseComplement(dna)) return(sapply(orients, toString)) # Convert back to character vector } FWD.orients <- allOrients(FWD) REV.orients <- allOrients(REV) FWD.orients ``` count number of times primers occur ```{r} primerHits <- function(primer, fn) { # Counts number of reads in which the primer is found nhits <- vcountPattern(primer, sread(readFastq(fn)), fixed = FALSE) return(sum(nhits > 0)) } rbind(FWD.ForwardReads = sapply(FWD.orients, primerHits, fn = fnFs[[1]]), FWD.ReverseReads = sapply(FWD.orients, primerHits, fn = fnRs[[1]]), REV.ForwardReads = sapply(REV.orients, primerHits, fn = fnFs[[1]]), REV.ReverseReads = sapply(REV.orients, primerHits, fn = fnRs[[1]])) ``` Tell R the path to the cutadapt command ```{r} cutadapt <- "/Users/hailaschultz/miniconda3/envs/cutadaptenv/bin/cutadapt" system2(cutadapt, args = "--version") ``` use cutadapt ```{r} path.cut <- file.path(path, "cutadapt") if(!dir.exists(path.cut)) dir.create(path.cut) fnFs.cut <- file.path(path.cut, basename(fnFs)) fnRs.cut <- file.path(path.cut, basename(fnRs)) FWD.RC <- dada2:::rc(FWD) REV.RC <- dada2:::rc(REV) # Trim FWD and the reverse-complement of REV off of R1 (forward reads) R1.flags <- paste("-g", FWD, "-a", REV.RC) # Trim REV and the reverse-complement of FWD off of R2 (reverse reads) R2.flags <- paste("-G", REV, "-A", FWD.RC) # Run Cutadapt for(i in seq_along(fnFs)) { system2(cutadapt, args = c(R1.flags, R2.flags, "-n", 2, # -n 2 required to remove FWD and REV from reads "-o", fnFs.cut[i], "-p", fnRs.cut[i], # output files fnFs[i], fnRs[i])) # input files } ``` Check if adapters were trimmed for one sample ```{r} rbind(FWD.ForwardReads = sapply(FWD.orients, primerHits, fn = fnFs.cut[[1]]), FWD.ReverseReads = sapply(FWD.orients, primerHits, fn = fnRs.cut[[1]]), REV.ForwardReads = sapply(REV.orients, primerHits, fn = fnFs.cut[[1]]), REV.ReverseReads = sapply(REV.orients, primerHits, fn = fnRs.cut[[1]])) #there are 5 forward reads that weren't trimmed... don't know why this is ``` QC check on files after primers were trimmed use fastqc ```{bash} cd /Users/hailaschultz/GitHub/haila-ZoopMetabarcoding/data/fastq-files/cutadapt /Users/hailaschultz/GitHub/haila-ZoopMetabarcoding/software/FastQC/fastqc *fastq.gz -o /Users/hailaschultz/GitHub/haila-ZoopMetabarcoding/output/after-cutadapt-qc ``` make multiqc report ```{bash} cd /Users/hailaschultz/GitHub/haila-ZoopMetabarcoding/output/after-cutadapt-qc multiqc . ```