---
title: "03-filter-reads"
output: html_document
date: "2023-05-04"
---
# Inspect read quality

Get sample names
```{r}
# Forward and reverse fastq filenames have the format:
cutFs <- sort(list.files(path.cut, pattern = "_R1_001.fastq.gz", full.names = TRUE))
cutRs <- sort(list.files(path.cut, pattern = "_R2_001.fastq.gz", full.names = TRUE))

# Extract sample names, assuming filenames have format:
get.sample.name <- function(fname) strsplit(basename(fname), "_")[[1]][1]
sample.names <- unname(sapply(cutFs, get.sample.name))
head(sample.names)
```


```{r}
plotQualityProfile(cutFs[1:2])
```
```{r}
#reverse read quality
plotQualityProfile(cutRs[1:2])
```

Filter and Trim
```{r}
filtFs <- file.path(path.cut, "filtered", basename(cutFs))
filtRs <- file.path(path.cut, "filtered", basename(cutRs))
```

```{r}
# get sample names to see which files match
get.sample.name <- function(fname) strsplit(basename(fname), "_")[[1]][1]
sample.namesF <- unname(sapply(cutFs, get.sample.name))
sample.namesR <- unname(sapply(cutRs, get.sample.name))
```

Check for differences between lists
```{r}
difsR <- setdiff(sample.namesR,sample.namesF)
difsR
difsF <- setdiff(sample.namesF,sample.namesR)
difsF
grep("2019APRP12VeA-COI-P1",filtFs)
grep("2019JULP22VeA-COI-P1",filtFs)
grep("2019APRP22Ve-COI-48samples",filtRs)

#remove non-matched files
indicesF <- c(35,51)
filtFs <- filtFs[-indicesF]

indicesR <- c(36)
filtRs <- filtRs[-indicesR]

#do the same for cut files
grep("2019APRP12VeA-COI-P1",cutFs)
grep("2019JULP22VeA-COI-P1",cutFs)
grep("2019APRP22Ve-COI-48samples",cutRs)

cutFs <- cutFs[-indicesF]
cutRs <- cutRs[-indicesR]

```


filter and trim
maxN: allows zero Ns
max EE: set to 5 for forward and reverse
truncQ: set to 2
minlen: minimum length is 100 reads
rm.phix:remove phix
```{r}
out <- filterAndTrim(cutFs, filtFs, cutRs, filtRs, maxN = 0, maxEE = c(5, 5), 
    truncQ = 2, minLen = 100, rm.phix = TRUE, compress = TRUE, multithread = TRUE) 
head(out)
```

Check how many samples went through filtering
```{bash}
cd /Users/hailaschultz/GitHub/haila-ZoopMetabarcoding/data/fastq-files/cutadapt

ls | wc -l

cd /Users/hailaschultz/GitHub/haila-ZoopMetabarcoding/data/fastq-files/cutadapt/filtered

ls | wc -l
```

learn the error rates
```{r}
errF <- learnErrors(filtFs, multithread=TRUE)
```
```{r}
errR <- learnErrors(filtRs, multithread=TRUE)
```

plot the errors
```{r}
plotErrors(errF, nominalQ=TRUE)
#looks okay! good to proceed
```


apply core sample inference algorithm
```{r}
dadaFs <- dada(filtFs, err=errF, multithread=TRUE)
```