--- title: "03-filter-reads" output: html_document date: "2023-05-04" --- # Inspect read quality Get sample names ```{r} # Forward and reverse fastq filenames have the format: cutFs <- sort(list.files(path.cut, pattern = "_R1_001.fastq.gz", full.names = TRUE)) cutRs <- sort(list.files(path.cut, pattern = "_R2_001.fastq.gz", full.names = TRUE)) # Extract sample names, assuming filenames have format: get.sample.name <- function(fname) strsplit(basename(fname), "_")[[1]][1] sample.names <- unname(sapply(cutFs, get.sample.name)) head(sample.names) ``` ```{r} plotQualityProfile(cutFs[1:2]) ``` ```{r} #reverse read quality plotQualityProfile(cutRs[1:2]) ``` Filter and Trim ```{r} filtFs <- file.path(path.cut, "filtered", basename(cutFs)) filtRs <- file.path(path.cut, "filtered", basename(cutRs)) ``` ```{r} # get sample names to see which files match get.sample.name <- function(fname) strsplit(basename(fname), "_")[[1]][1] sample.namesF <- unname(sapply(cutFs, get.sample.name)) sample.namesR <- unname(sapply(cutRs, get.sample.name)) ``` Check for differences between lists ```{r} difsR <- setdiff(sample.namesR,sample.namesF) difsR difsF <- setdiff(sample.namesF,sample.namesR) difsF grep("2019APRP12VeA-COI-P1",filtFs) grep("2019JULP22VeA-COI-P1",filtFs) grep("2019APRP22Ve-COI-48samples",filtRs) #remove non-matched files indicesF <- c(35,51) filtFs <- filtFs[-indicesF] indicesR <- c(36) filtRs <- filtRs[-indicesR] #do the same for cut files grep("2019APRP12VeA-COI-P1",cutFs) grep("2019JULP22VeA-COI-P1",cutFs) grep("2019APRP22Ve-COI-48samples",cutRs) cutFs <- cutFs[-indicesF] cutRs <- cutRs[-indicesR] ``` filter and trim maxN: allows zero Ns max EE: set to 5 for forward and reverse truncQ: set to 2 minlen: minimum length is 100 reads rm.phix:remove phix ```{r} out <- filterAndTrim(cutFs, filtFs, cutRs, filtRs, maxN = 0, maxEE = c(5, 5), truncQ = 2, minLen = 100, rm.phix = TRUE, compress = TRUE, multithread = TRUE) head(out) ``` Check how many samples went through filtering ```{bash} cd /Users/hailaschultz/GitHub/haila-ZoopMetabarcoding/data/fastq-files/cutadapt ls | wc -l cd /Users/hailaschultz/GitHub/haila-ZoopMetabarcoding/data/fastq-files/cutadapt/filtered ls | wc -l ``` learn the error rates ```{r} errF <- learnErrors(filtFs, multithread=TRUE) ``` ```{r} errR <- learnErrors(filtRs, multithread=TRUE) ``` plot the errors ```{r} plotErrors(errF, nominalQ=TRUE) #looks okay! good to proceed ``` apply core sample inference algorithm ```{r} dadaFs <- dada(filtFs, err=errF, multithread=TRUE) ```