---
title: "Dada2_report"
author: "XX XXXX"
date: "`r format(Sys.time(), '%B %d, %Y')`"
output: html_document
params:
  folder:
    value: /cloud/project/output.test/noprimers
  hash: TRUE
  trimming.length.Read1: 200
  trimming.length.Read2: 200
  metadata: output.metadata.csv
  output.folder: outputs
  keep.mid.files: TRUE
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
knitr::opts_knit$set(root.dir = params$folder)
```

## Dada2 report

You have successfully used `cutadapt` to split the .FASTQ files by locus and remove the primer sequences. No small feat. Now we are going to apply the denoising algorithm `DADA2` (https://github.com/benjjneb/dada2) to estimate the original sample composition. In the parameters section you have chosen a trimming length for each of the two reads, whether to use hashes or not, and a folder in which to drop the output files.

First, load the packages and set up the parameters we need.

```{r loading packages, echo=TRUE, message=FALSE}
library(tidyverse)
library(dada2)
library(digest)
library(seqinr)
library(knitr)
library(kableExtra)

sample.metadata <- read_csv(params$metadata)

filt_path <- file.path(params$folder, "filtered")
getwd()
filt_path

# Check if the output directory exists - if not, create it
if (!dir.exists(file.path(params$output.folder))) {
  dir.create(path = file.path(params$output.folder), recursive = TRUE)
}
output.dir <- file.path(params$output.folder)
output.dir

# Write the parameters to a file in the output dir
tibble(Parameters = names(params), Values = unlist(params)) %>%
  write_csv(file.path(output.dir, paste0("parameters_", Sys.Date(), ".csv")))
```

```{r check quality trim point, message=FALSE, warning=FALSE}
# Plot quality profiles for at most four samples
if (nrow(sample.metadata) > 4) {
  subset <- sample.metadata %>% sample_n(4)
} else {
  subset <- sample.metadata
}

subset %>% pull(file1) %>% plotQualityProfile(.)
subset %>% pull(file2) %>% plotQualityProfile(.)
```

The most common amplicon length is 302 bp, so trimming each read to 200 bp should give us enough overlap to successfully merge the forward and reverse reads. If quality drops off sharply towards the end of the reverse reads, you might choose to trim R2 shorter; in that case, rerun the script with different trimming lengths.
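As a quick sanity check before settling on trimming lengths, you can compute the overlap the truncated reads will leave for merging. This is a minimal sketch, assuming the 302 bp modal amplicon length mentioned above; the 12 bp threshold is `mergePairs()`'s default `minOverlap`.

```{r overlap check, eval=FALSE}
# Illustrative only: expected overlap between the truncated reads.
# Assumes a 302 bp amplicon, as estimated from the length distribution above.
amplicon_length <- 302
overlap <- params$trimming.length.Read1 + params$trimming.length.Read2 - amplicon_length
overlap        # 200 + 200 - 302 = 98 bp
overlap >= 12  # mergePairs() requires at least minOverlap (12 bp by default)
```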
```{r dadaing}
sample.metadata %>%
  separate(file1, into = "basename", sep = "_L001_R1", remove = FALSE) %>%
  mutate(filtF1 = file.path("filtered", paste0(basename, "_F1_filt.fastq.gz")),
         filtR1 = file.path("filtered", paste0(basename, "_R1_filt.fastq.gz"))) %>%
  select(-basename) %>%
  mutate(outFs = pmap(.l = list(file1, filtF1, file2, filtR1),
                      .f = function(a, b, c, d) {
                        filterAndTrim(a, b, c, d,
                                      truncLen = c(params$trimming.length.Read1,
                                                   params$trimming.length.Read2),
                                      maxN = 0, maxEE = c(2, 2), truncQ = 2,
                                      rm.phix = TRUE, compress = TRUE,
                                      multithread = TRUE)
                      }),
         errF1 = map(filtF1, ~ learnErrors(.x, multithread = TRUE, verbose = 0)), # learn error rates
         errR1 = map(filtR1, ~ learnErrors(.x, multithread = TRUE, verbose = 0)),
         derepF1 = map(filtF1, derepFastq), # dereplicate seqs
         derepR1 = map(filtR1, derepFastq),
         dadaF1 = map2(derepF1, errF1, ~ dada(.x, err = .y, multithread = TRUE)), # denoise
         dadaR1 = map2(derepR1, errR1, ~ dada(.x, err = .y, multithread = TRUE)),
         mergers = pmap(.l = list(dadaF1, derepF1, dadaR1, derepR1), # merge pairs
                        .f = mergePairs)) -> output.dada2

if (params$keep.mid.files == TRUE) {
  write_rds(output.dada2, path = "output.halfway.rds")
}

seqtabF <- makeSequenceTable(output.dada2$mergers)
dim(seqtabF)
table(nchar(getSequences(seqtabF)))
```

Now we will remove chimeric sequences. This is done within dada2, but you could use other tools.

```{r RemovingChimeras, message=FALSE}
seqtab.nochim <- removeBimeraDenovo(seqtabF, method = "consensus", multithread = TRUE)
dim(seqtab.nochim)
seqtab.nochim.df <- as.data.frame(seqtab.nochim)
```

```{r setting up output files}
# Copy the metadata so it is all in one place
sample.metadata %>% write_csv(file.path(output.dir, "metadata.csv"))

# Output files
conv_file <- file.path(output.dir, "hash_key.csv")
conv_file.fasta <- file.path(output.dir, "hash_key.fasta")
ASV_file <- file.path(output.dir, "ASV_table.csv")

print(conv_file)
print(conv_file.fasta)
print(ASV_file)
```

```{r Hash or not}
if (params$hash == TRUE) {
  # Hash each sequence with sha1 and keep a key linking hashes to sequences
  Hashes <- map_chr(colnames(seqtab.nochim.df),
                    ~ digest(.x, algo = "sha1", serialize = FALSE, skip = "auto"))
  conv_table <- tibble(Hash = Hashes, Sequence = colnames(seqtab.nochim.df))
  colnames(seqtab.nochim.df) <- Hashes

  write_csv(conv_table, conv_file) # write the key into a file
  write.fasta(sequences = as.list(conv_table$Sequence),
              names = as.list(conv_table$Hash),
              file.out = conv_file.fasta)

  seqtab.nochim.df <- bind_cols(sample.metadata %>% select(Sample_name, Locus),
                                seqtab.nochim.df)
  seqtab.nochim.df %>%
    pivot_longer(cols = c(-Sample_name, -Locus),
                 names_to = "Hash",
                 values_to = "nReads") %>%
    filter(nReads > 0) -> current_asv
  write_csv(current_asv, ASV_file)
} else {
  # Without hashes: keep the sequences as the table header and write a single file
  seqtab.nochim.df <- bind_cols(sample.metadata %>% select(Sample_name, Locus),
                                seqtab.nochim.df) # sample info is needed here too before pivoting
  seqtab.nochim.df %>%
    pivot_longer(cols = c(-Sample_name, -Locus),
                 names_to = "Sequence",
                 values_to = "nReads") %>%
    filter(nReads > 0) -> current_asv
  write_csv(current_asv, ASV_file)
}
```

## Track the fate of all reads

```{r output_summary}
getN <- function(x) sum(getUniques(x))

output.dada2 %>%
  select(-file1, -file2, -filtF1, -filtR1, -errF1, -errR1, -derepF1, -derepR1) %>%
  mutate_at(.vars = c("dadaF1", "dadaR1", "mergers"),
            ~ sapply(.x, getN)) %>%
  mutate(input = map_dbl(outFs, ~ .x[1]),    # reads going into filterAndTrim
         filtered = map_dbl(outFs, ~ .x[2]), # reads passing the filter
         tabled = rowSums(seqtabF),
         nonchim = rowSums(seqtab.nochim)) %>%
  select(Sample_name, Locus, input, filtered,
         denoised_F = dadaF1, denoised_R = dadaR1,
         merged = mergers, tabled, nonchim) -> track

write_csv(track, file.path(output.dir, "dada2_summary.csv"))
```
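To see where reads are being lost, it can help to express each step in `track` as a fraction of the input. This is an optional sketch building on the table above, not one of the files the pipeline writes:

```{r read retention, eval=FALSE}
# Fraction of input reads surviving each step (rounded to 3 decimals)
track %>%
  mutate_at(vars(filtered, denoised_F, denoised_R, merged, tabled, nonchim),
            ~ round(.x / input, 3)) %>%
  select(-input)
```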
```{r drop}
if (params$keep.mid.files == FALSE) {
  unlink(filt_path, recursive = TRUE)
}
```

```{r output_summary table and fig}
kable(track, align = "c", format = "html") %>%
  kable_styling(bootstrap_options = "striped", full_width = TRUE, position = "center") %>%
  column_spec(2, bold = TRUE)
```

```{r output_summary plot}
track %>%
  mutate_if(is.numeric, as.integer) %>%
  pivot_longer(cols = c(-Sample_name, -Locus),
               names_to = "Step",
               values_to = "Number of Sequences") %>%
  mutate(Step = fct_relevel(Step,
                            c("input", "filtered", "denoised_F", "denoised_R",
                              "merged", "tabled", "nonchim"))) %>%
  ggplot(aes(x = Step, y = `Number of Sequences`,
             group = Sample_name, color = Sample_name)) +
  geom_line() +
  facet_wrap(~Sample_name) +
  guides(color = "none")
```
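Downstream analyses will need to link hashes back to their sequences. A minimal sketch of how you might do that, assuming the default `hash = TRUE` outputs written above:

```{r rejoin hashes, eval=FALSE}
# Optional: re-attach sequences to the hashed ASV table using the key.
# Uses only the files this report writes (ASV_table.csv, hash_key.csv).
asv <- read_csv(file.path(output.dir, "ASV_table.csv"))
key <- read_csv(file.path(output.dir, "hash_key.csv"))

asv %>%
  left_join(key, by = "Hash") %>% # adds the Sequence column back
  head()
```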