You have successfully split your libraries into a pair (or two pairs) of fastq files per sample. Now let's import the output of demultiplex_both_fastq.sh.
First, load the packages and update the parameters we need.
## Warning: package 'S4Vectors' was built under R version 3.6.2
## Warning: package 'IRanges' was built under R version 3.6.2
## [1] "/Users/ramon.gallegosimon/test/noprimers"
## [1] "~/test/noprimers/filtered"
sample.metadata %>%
  sample_n(4) %>% pull(file1) %>%
  plotQualityProfile(.)

sample.metadata %>%
  sample_n(4) %>% pull(file2) %>%
  plotQualityProfile(.)
The most common amplicon length is 302 bp, so trimming each read to 200 bp should give us enough overlap to successfully merge the forward and reverse reads. Based on the drop in quality of the reverse reads, you might choose to trim R2 shorter. In that case, rerun the script with different trimming lengths.
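If you do decide to trim R2 shorter, the only change needed before rerunning is the pair of trimming parameters passed to `filterAndTrim` below; a minimal sketch (the 150 bp value is just an example, chosen so that 200 + 150 still comfortably spans the 302 bp amplicon):
params$trimming.length.Read1 <- 200  # forward reads hold their quality longer
params$trimming.length.Read2 <- 150  # example value: shorter because of the R2 quality drop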
sample.metadata %>%
  mutate(filtF1 = file.path("filtered", str_replace(file1, "_L001_R1_001.fastq_sub.fastq", "_F1_filt.fastq.gz")),
         filtR1 = file.path("filtered", str_replace(file2, "_L001_R2_001.fastq_sub.fastq", "_R1_filt.fastq.gz"))) %>%
  mutate(outFs = pmap(.l = list(file1, filtF1, file2, filtR1),
                      .f = function(a, b, c, d) {
                        filterAndTrim(a, b, c, d,
                                      truncLen = c(params$trimming.length.Read1, params$trimming.length.Read2),
                                      maxN = 0, maxEE = c(2, 2), truncQ = 2, rm.phix = TRUE,
                                      compress = TRUE, multithread = TRUE)
                      }),
         errF1   = map(filtF1, ~ learnErrors(.x, multithread = TRUE, verbose = 0)),  # Calculate errors
         errR1   = map(filtR1, ~ learnErrors(.x, multithread = TRUE, verbose = 0)),
         derepF1 = map(filtF1, derepFastq),                                          # dereplicate seqs
         derepR1 = map(filtR1, derepFastq),
         dadaF1  = map2(derepF1, errF1, ~ dada(.x, err = .y, multithread = TRUE)),   # dada2
         dadaR1  = map2(derepR1, errR1, ~ dada(.x, err = .y, multithread = TRUE)),
         mergers = pmap(.l = list(dadaF1, derepF1, dadaR1, derepR1),                 # merge things
                        .f = mergePairs)) -> output.dada2
## Sample 1 - 429 reads in 220 unique sequences.
## Sample 1 - 516 reads in 261 unique sequences.
## Sample 1 - 650 reads in 267 unique sequences.
## Sample 1 - 735 reads in 287 unique sequences.
## Sample 1 - 698 reads in 200 unique sequences.
## Sample 1 - 722 reads in 294 unique sequences.
## Sample 1 - 736 reads in 289 unique sequences.
## Sample 1 - 666 reads in 258 unique sequences.
## Sample 1 - 719 reads in 369 unique sequences.
## Sample 1 - 636 reads in 270 unique sequences.
## Sample 1 - 732 reads in 328 unique sequences.
## Sample 1 - 674 reads in 268 unique sequences.
## Sample 1 - 573 reads in 268 unique sequences.
## Sample 1 - 651 reads in 292 unique sequences.
## Sample 1 - 655 reads in 288 unique sequences.
## Sample 1 - 429 reads in 311 unique sequences.
## Sample 1 - 516 reads in 361 unique sequences.
## Sample 1 - 650 reads in 411 unique sequences.
## Sample 1 - 735 reads in 436 unique sequences.
## Sample 1 - 698 reads in 376 unique sequences.
## Sample 1 - 722 reads in 479 unique sequences.
## Sample 1 - 736 reads in 460 unique sequences.
## Sample 1 - 666 reads in 415 unique sequences.
## Sample 1 - 719 reads in 493 unique sequences.
## Sample 1 - 636 reads in 453 unique sequences.
## Sample 1 - 732 reads in 467 unique sequences.
## Sample 1 - 674 reads in 428 unique sequences.
## Sample 1 - 573 reads in 426 unique sequences.
## Sample 1 - 651 reads in 469 unique sequences.
## Sample 1 - 655 reads in 464 unique sequences.
write_rds(output.dada2, path = "output.halfway.rds")
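Writing this intermediate object means a later session can pick up the analysis from here without repeating the filtering and denoising steps; for example:
# In a fresh session, reload the checkpoint instead of rerunning dada2
output.dada2 <- read_rds("output.halfway.rds")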
seqtabF <- makeSequenceTable(output.dada2$mergers)
dim(seqtabF)
## [1] 15 22
table(nchar(getSequences(seqtabF)))
##
## 200 302 304 330 331 342
## 1 12 1 1 1 6
Firstly, we’ll find the patterns that separate our Fwd, Rev, .1 and .2 files. look at the quality of the .1 and .2 reads Run dada 2 step by step
seqtab.nochim <- removeBimeraDenovo(seqtabF, method="consensus", multithread=TRUE)
dim(seqtab.nochim)
## [1] 15 15
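Besides the number of ASVs kept, it is worth checking what fraction of the reads survived chimera removal; a quick check in the spirit of the dada2 tutorial:
# Proportion of total reads retained after bimera removal
sum(seqtab.nochim) / sum(seqtabF)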
seqtab.nochim.df <- as.data.frame(seqtab.nochim)
# Output files
conv_file <- file.path(output.dir, "hash_key.csv")
conv_file.fasta <- file.path(output.dir, "hash_key.fasta")
ASV_file <- file.path(output.dir, "ASV_table.csv")
# Now decide if you want hashing or not
if (params$hash == TRUE) {
  conv_table <- tibble(Hash = "", Sequence = "")
  hashes <- list(NULL)
  ######## Create the Hash key file
  for (i in 1:ncol(seqtab.nochim.df)) {            # for each column of the dataframe
    current_seq <- colnames(seqtab.nochim.df)[i]   # take the whole sequence from each column of the dataframe
    current_hash <- digest(current_seq, algo = "sha1", serialize = F, skip = "auto")  # hash it so it is both shorter and unique
    hashes[[i]] <- current_hash
    conv_table[i, ] <- c(current_hash, current_seq)  # add the Hash - sequence conversion to a table
    colnames(seqtab.nochim.df)[i] <- current_hash
  }
  write_csv(conv_table, conv_file)                 # write the table into a file
  write.fasta(sequences = as.list(conv_table$Sequence),
              names = as.list(conv_table$Hash),
              file.out = conv_file.fasta)
  seqtab.nochim.df <- bind_cols(sample.metadata %>%
                                  select(Sample_name, Locus),
                                seqtab.nochim.df)
  seqtab.nochim.df %>%
    pivot_longer(cols = c(-Sample_name, -Locus),
                 names_to = "Hash",
                 values_to = "nReads") %>%
    filter(nReads > 0) -> current_asv
  write_csv(current_asv, ASV_file)
} else {
  # What to do if you don't want hashes: two things - change the header of the ASV table, write only one file
  seqtab.nochim.df %>%
    pivot_longer(cols = c(-Sample_name, -Locus),
                 names_to = "Sequence",
                 values_to = "nReads") %>%
    filter(nReads > 0) -> current_asv
  write_csv(current_asv, ASV_file)
}
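When hashing is on, downstream scripts can always recover the full sequences by joining the ASV table back to the hash key; a minimal sketch using the two files written above:
# Re-attach sequences to the hashed ASV table (only relevant when params$hash == TRUE)
ASV.table <- read_csv(ASV_file)
hash.key  <- read_csv(conv_file)
ASV.table.with.seqs <- left_join(ASV.table, hash.key, by = "Hash")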
## Track the fate of all reads
getN <- function(x) sum(getUniques(x))
output.dada2 %>%
  select(-file1, -file2, -filtF1, -filtR1, -errF1, -errR1, -derepF1, -derepR1) %>%
  mutate_at(.vars = c("dadaF1", "dadaR1", "mergers"),
            ~ sapply(.x, getN)) %>%
  # pull(outFs) -> test
  mutate(input = map_dbl(outFs, ~ .x[1]),       # reads.in from filterAndTrim
         filtered = map_dbl(outFs, ~ .x[2]),    # reads.out from filterAndTrim
         tabled = rowSums(seqtabF),
         nonchim = rowSums(seqtab.nochim)) %>%
  select(Sample_name,
         Locus,
         input,
         filtered,
         denoised_F = dadaF1,
         denoised_R = dadaR1,
         merged = mergers,
         tabled,
         nonchim) -> track
write_csv(track, file.path(params$output.folder, "dada2_summary.csv"))
kable(track, align = "c", format = "html") %>%
  kable_styling(bootstrap_options = "striped", full_width = T, position = "center") %>%
  column_spec(2, bold = T)
Sample_name | Locus | input | filtered | denoised_F | denoised_R | merged | tabled | nonchim |
---|---|---|---|---|---|---|---|---|
51530-026_S79 | 16S_Prey | 996 | 429 | 421 | 429 | 421 | 421 | 421 |
51530-027_S4 | 16S_Prey | 995 | 516 | 510 | 509 | 509 | 509 | 452 |
51530-029_S14 | 16S_Prey | 993 | 650 | 646 | 641 | 625 | 625 | 584 |
51530-031_S26 | 16S_Prey | 997 | 735 | 730 | 734 | 729 | 729 | 729 |
51530-032_S37 | 16S_Prey | 991 | 698 | 698 | 698 | 698 | 698 | 698 |
51530-033_S48 | 16S_Prey | 992 | 722 | 720 | 720 | 720 | 720 | 720 |
51530-034_S59 | 16S_Prey | 990 | 736 | 735 | 736 | 735 | 735 | 735 |
51530-035_S69 | 16S_Prey | 990 | 666 | 664 | 664 | 662 | 662 | 662 |
51530-036_S80 | 16S_Prey | 995 | 719 | 718 | 719 | 670 | 670 | 655 |
51530-037_S5 | 16S_Prey | 986 | 636 | 634 | 629 | 554 | 554 | 554 |
51530-038_S15 | 16S_Prey | 996 | 732 | 728 | 732 | 728 | 728 | 728 |
51530-039_S27 | 16S_Prey | 992 | 674 | 672 | 670 | 670 | 670 | 670 |
51530-040_S38 | 16S_Prey | 987 | 573 | 568 | 573 | 531 | 531 | 514 |
51530-041_S49 | 16S_Prey | 993 | 651 | 651 | 649 | 649 | 649 | 649 |
51530-042_S60 | 16S_Prey | 994 | 655 | 653 | 654 | 643 | 643 | 643 |
track %>%
  mutate_if(is.numeric, as.integer) %>%
  pivot_longer(cols = c(-Sample_name, -Locus),
               names_to = "Step",
               values_to = "Number of Sequences") %>%
  mutate(Step = fct_relevel(Step,
                            "input", "filtered", "denoised_F", "denoised_R", "merged", "tabled", "nonchim")) %>%
  ggplot(aes(x = Step, y = `Number of Sequences`, group = Sample_name, color = Sample_name)) +
  geom_line() +
  facet_wrap(~Locus) +
  guides(color = "none")