Dada2 report

You have successfully split your libraries into a pair (or two) of fastq files per sample. Now let’s import the output of demultiplex_both_fastq.sh

First load the packages. And let’s update the parameters we need

## Warning: package 'S4Vectors' was built under R version 3.6.2
## Warning: package 'IRanges' was built under R version 3.6.2
## [1] "/Users/ramon.gallegosimon/test/noprimers"
## [1] "~/test/noprimers/filtered"
sample.metadata %>% 
  sample_n(4) %>% pull(file1) %>% 
  plotQualityProfile(.)

sample.metadata %>% 
  sample_n(4) %>% pull(file2) %>% 
  plotQualityProfile(.)

The most common Amplicon length is 302, so trimming to 200 bp each read should give us enough overlap to successfully merge Forward and Reverse reads. Based on the drop of quality from the reverse read, you might choose to make them shorter on the R2. In that case, rerun the script setting the trimming differently.

sample.metadata %>% 
  mutate(filtF1 = file.path("filtered", str_replace(file1, "_L001_R1_001.fastq_sub.fastq", "_F1_filt.fastq.gz")),
         filtR1 = file.path("filtered", str_replace(file2, "_L001_R2_001.fastq_sub.fastq", "_R1_filt.fastq.gz"))) %>% 
  mutate (outFs = pmap(.l= list (file1, filtF1, file2, filtR1),
                       .f = function(a, b, c, d) {
                         filterAndTrim(a,b,c,d,
                                       truncLen=c(params$trimming.length.Read1,params$trimming.length.Read2),
                                       maxN=0, maxEE=c(2,2), truncQ=2, rm.phix=TRUE,
                                       compress=TRUE, multithread=TRUE )
                       } ),
          errF1 = map(filtF1, ~ learnErrors(.x, multithread=TRUE,verbose = 0)),     # Calculate errors
          errR1 = map(filtR1, ~ learnErrors(.x, multithread=TRUE,verbose = 0)),
          derepF1 = map(filtF1, derepFastq),                   # dereplicate seqs
          derepR1 = map(filtR1, derepFastq),
          dadaF1  = map2(derepF1,errF1, ~ dada(.x, err = .y, multithread = TRUE)),  # dada2
          dadaR1  = map2(derepR1,errR1, ~ dada(.x, err = .y, multithread = TRUE)),
          mergers = pmap(.l = list(dadaF1,derepF1, dadaR1,derepR1),                 # merge things
                         .f = mergePairs )) -> output.dada2
## Sample 1 - 429 reads in 220 unique sequences.
## Sample 1 - 516 reads in 261 unique sequences.
## Sample 1 - 650 reads in 267 unique sequences.
## Sample 1 - 735 reads in 287 unique sequences.
## Sample 1 - 698 reads in 200 unique sequences.
## Sample 1 - 722 reads in 294 unique sequences.
## Sample 1 - 736 reads in 289 unique sequences.
## Sample 1 - 666 reads in 258 unique sequences.
## Sample 1 - 719 reads in 369 unique sequences.
## Sample 1 - 636 reads in 270 unique sequences.
## Sample 1 - 732 reads in 328 unique sequences.
## Sample 1 - 674 reads in 268 unique sequences.
## Sample 1 - 573 reads in 268 unique sequences.
## Sample 1 - 651 reads in 292 unique sequences.
## Sample 1 - 655 reads in 288 unique sequences.
## Sample 1 - 429 reads in 311 unique sequences.
## Sample 1 - 516 reads in 361 unique sequences.
## Sample 1 - 650 reads in 411 unique sequences.
## Sample 1 - 735 reads in 436 unique sequences.
## Sample 1 - 698 reads in 376 unique sequences.
## Sample 1 - 722 reads in 479 unique sequences.
## Sample 1 - 736 reads in 460 unique sequences.
## Sample 1 - 666 reads in 415 unique sequences.
## Sample 1 - 719 reads in 493 unique sequences.
## Sample 1 - 636 reads in 453 unique sequences.
## Sample 1 - 732 reads in 467 unique sequences.
## Sample 1 - 674 reads in 428 unique sequences.
## Sample 1 - 573 reads in 426 unique sequences.
## Sample 1 - 651 reads in 469 unique sequences.
## Sample 1 - 655 reads in 464 unique sequences.
write_rds(output.dada2, path = "output.halfway.rds")
seqtabF <- makeSequenceTable(output.dada2$mergers)
dim(seqtabF)
## [1] 15 22
table(nchar(getSequences(seqtabF)))
## 
## 200 302 304 330 331 342 
##   1  12   1   1   1   6

Firstly, we’ll find the patterns that separate our Fwd, Rev, .1 and .2 files. look at the quality of the .1 and .2 reads Run dada 2 step by step

seqtab.nochim <- removeBimeraDenovo(seqtabF, method="consensus", multithread=TRUE)
dim(seqtab.nochim)
## [1] 15 15
seqtab.nochim.df <- as.data.frame(seqtab.nochim)
# Output files
  conv_file <- file.path(output.dir,"hash_key.csv")
  conv_file.fasta <- file.path(output.dir,"hash_key.fasta")
  
  ASV_file <-  file.path(output.dir,"ASV_table.csv")
# Now decide if you want hashing or not
if ( params$hash==TRUE)
  {
  
  conv_table <- tibble( Hash = "", Sequence ="")
  
  hashes <- list(NULL)
 
######## Create the Hash key file
  
  for (i in 1:ncol(seqtab.nochim.df)) {   #for each column of the dataframe
    current_seq <-colnames(seqtab.nochim.df)[i] # Take the whole sequence from each column of the dataframe
    current_hash <- digest(current_seq,algo = "sha1",serialize = F,skip = "auto") # Hash it so it is both shorter and unique
    hashes[[i]] = current_hash
    conv_table [i,]<-c(current_hash, current_seq) # add the Hash - sequence conversion to a table
    colnames(seqtab.nochim.df)[i] <- current_hash
  }
  write_csv(conv_table, conv_file) # write the table into a file
  
  write.fasta(sequences = as.list(conv_table$Sequence),
              names     = as.list(conv_table$Hash),
              file.out = conv_file.fasta)

 seqtab.nochim.df <- bind_cols(sample.metadata %>% 
                                select(Sample_name, Locus),
                              seqtab.nochim.df)


seqtab.nochim.df %>% 
  pivot_longer(cols = c(- Sample_name, - Locus),
              names_to = "Hash",
              values_to = "nReads") %>% 
  filter(nReads > 0) -> current_asv
write_csv(current_asv, ASV_file)    }else{
  #What do we do if you don't want hashes: two things - Change the header of the ASV table, write only one file

   seqtab.nochim.df %>% 
  pivot_longer(cols = c(- Sample_name, - Locus),
              names_to = "Sequence",
              values_to = "nReads") %>% 
  filter(nReads > 0) -> current_asv
write_csv(current_asv, ASV_file) 
  
}

##Track the fate of all reads

getN <- function(x) sum(getUniques(x))

output.dada2 %>% 
  select(-file1, -file2, -filtF1, -filtR1, -errF1, -errR1, -derepF1, -derepR1) %>% 
  mutate_at(.vars = c("dadaF1", "dadaR1", "mergers"),
            ~ sapply(.x,getN)) %>% 
#  pull(outFs) -> test
  mutate(input = map_dbl(outFs, ~ .x[1]),
         filtered = map_dbl(outFs, ~ .x[2]),
         tabled  = rowSums(seqtabF),
         nonchim = rowSums(seqtab.nochim)) %>% 
  select(Sample_name,
         Locus,
         input,
         filtered,
         denoised_F = dadaF1,
         denoised_R = dadaR1,
         merged = mergers,
         tabled,
         nonchim) -> track

write_csv(track, file.path(params$output.folder,"dada2_summary.csv"))

kable (track, align= "c", format = "html") %>%
      kable_styling(bootstrap_options= "striped", full_width = T, position = "center") %>%
      column_spec(2, bold=T) 
Sample_name Locus input filtered denoised_F denoised_R merged tabled nonchim
51530-026_S79 16S_Prey 996 429 421 429 421 421 421
51530-027_S4 16S_Prey 995 516 510 509 509 509 452
51530-029_S14 16S_Prey 993 650 646 641 625 625 584
51530-031_S26 16S_Prey 997 735 730 734 729 729 729
51530-032_S37 16S_Prey 991 698 698 698 698 698 698
51530-033_S48 16S_Prey 992 722 720 720 720 720 720
51530-034_S59 16S_Prey 990 736 735 736 735 735 735
51530-035_S69 16S_Prey 990 666 664 664 662 662 662
51530-036_S80 16S_Prey 995 719 718 719 670 670 655
51530-037_S5 16S_Prey 986 636 634 629 554 554 554
51530-038_S15 16S_Prey 996 732 728 732 728 728 728
51530-039_S27 16S_Prey 992 674 672 670 670 670 670
51530-040_S38 16S_Prey 987 573 568 573 531 531 514
51530-041_S49 16S_Prey 993 651 651 649 649 649 649
51530-042_S60 16S_Prey 994 655 653 654 643 643 643
track %>% 
  mutate_if(is.numeric, as.integer) %>% 
  pivot_longer(cols = c(-Sample_name, -Locus),
               names_to = "Step",
               values_to = "Number of Sequences") %>% 
  mutate (Step = fct_relevel(Step, 
                             levels = c( "input","filtered","denoised_F" ,"denoised_R" , "merged" , "tabled", "nonchim"))) %>% 
  ggplot(aes(x = Step, y = `Number of Sequences`, group =  Sample_name, color = Sample_name)) +
  geom_line() +
  facet_wrap(~Locus) +
  guides(color = "none")
## Warning: Outer names are only allowed for unnamed scalar atomic inputs