You have successfully split your libraries into a pair (or two pairs) of fastq files per sample. Now let's import the output of demultiplex_both_fastq.sh.
First, load the packages and update the parameters we need.
## Warning: package 'S4Vectors' was built under R version 3.6.2
## Warning: package 'IRanges' was built under R version 3.6.2
## [1] "/Users/ramon.gallegosimon/test/noprimers"
## [1] "~/test/noprimers/filtered"
sample.metadata %>%
  sample_n(4) %>% pull(file1) %>%
  plotQualityProfile(.)

sample.metadata %>%
  sample_n(4) %>% pull(file2) %>%
  plotQualityProfile(.)
The most common amplicon length is 302 bp, so trimming each read to 200 bp should give us enough overlap to successfully merge the forward and reverse reads. Based on the drop in quality of the reverse reads, you might choose to trim R2 shorter. In that case, rerun the script with different trimming lengths.
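If you do decide to trim R2 shorter, the only change needed before rerunning is the pair of trimming parameters passed to `filterAndTrim` below; a minimal sketch (the 150 bp value is just an example, chosen so that 200 + 150 still comfortably spans the 302 bp amplicon):
params$trimming.length.Read1 <- 200  # forward reads hold their quality longer
params$trimming.length.Read2 <- 150  # example value: shorter because of the R2 quality drop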
sample.metadata %>%
  mutate(filtF1 = file.path("filtered", str_replace(file1, "_L001_R1_001.fastq_sub.fastq", "_F1_filt.fastq.gz")),
         filtR1 = file.path("filtered", str_replace(file2, "_L001_R2_001.fastq_sub.fastq", "_R1_filt.fastq.gz"))) %>%
  mutate(outFs = pmap(.l = list(file1, filtF1, file2, filtR1),
                      .f = function(a, b, c, d) {
                        filterAndTrim(a, b, c, d,
                                      truncLen = c(params$trimming.length.Read1, params$trimming.length.Read2),
                                      maxN = 0, maxEE = c(2, 2), truncQ = 2, rm.phix = TRUE,
                                      compress = TRUE, multithread = TRUE)
                      }),
         errF1   = map(filtF1, ~ learnErrors(.x, multithread = TRUE, verbose = 0)),  # Calculate errors
         errR1   = map(filtR1, ~ learnErrors(.x, multithread = TRUE, verbose = 0)),
         derepF1 = map(filtF1, derepFastq),                                          # dereplicate seqs
         derepR1 = map(filtR1, derepFastq),
         dadaF1  = map2(derepF1, errF1, ~ dada(.x, err = .y, multithread = TRUE)),   # dada2
         dadaR1  = map2(derepR1, errR1, ~ dada(.x, err = .y, multithread = TRUE)),
         mergers = pmap(.l = list(dadaF1, derepF1, dadaR1, derepR1),                 # merge things
                        .f = mergePairs)) -> output.dada2
## Sample 1 - 429 reads in 220 unique sequences.
## Sample 1 - 516 reads in 261 unique sequences.
## Sample 1 - 650 reads in 267 unique sequences.
## Sample 1 - 735 reads in 287 unique sequences.
## Sample 1 - 698 reads in 200 unique sequences.
## Sample 1 - 722 reads in 294 unique sequences.
## Sample 1 - 736 reads in 289 unique sequences.
## Sample 1 - 666 reads in 258 unique sequences.
## Sample 1 - 719 reads in 369 unique sequences.
## Sample 1 - 636 reads in 270 unique sequences.
## Sample 1 - 732 reads in 328 unique sequences.
## Sample 1 - 674 reads in 268 unique sequences.
## Sample 1 - 573 reads in 268 unique sequences.
## Sample 1 - 651 reads in 292 unique sequences.
## Sample 1 - 655 reads in 288 unique sequences.
## Sample 1 - 429 reads in 311 unique sequences.
## Sample 1 - 516 reads in 361 unique sequences.
## Sample 1 - 650 reads in 411 unique sequences.
## Sample 1 - 735 reads in 436 unique sequences.
## Sample 1 - 698 reads in 376 unique sequences.
## Sample 1 - 722 reads in 479 unique sequences.
## Sample 1 - 736 reads in 460 unique sequences.
## Sample 1 - 666 reads in 415 unique sequences.
## Sample 1 - 719 reads in 493 unique sequences.
## Sample 1 - 636 reads in 453 unique sequences.
## Sample 1 - 732 reads in 467 unique sequences.
## Sample 1 - 674 reads in 428 unique sequences.
## Sample 1 - 573 reads in 426 unique sequences.
## Sample 1 - 651 reads in 469 unique sequences.
## Sample 1 - 655 reads in 464 unique sequences.
write_rds(output.dada2, path = "output.halfway.rds")
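Writing this intermediate object means a later session can pick up the analysis from here without repeating the filtering and denoising steps; for example:
# In a fresh session, reload the checkpoint instead of rerunning dada2
output.dada2 <- read_rds("output.halfway.rds")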
seqtabF <- makeSequenceTable(output.dada2$mergers)
dim(seqtabF)
## [1] 15 22
table(nchar(getSequences(seqtabF)))
##
## 200 302 304 330 331 342
## 1 12 1 1 1 6
Firstly, we’ll find the patterns that separate our Fwd, Rev, .1 and .2 files. look at the quality of the .1 and .2 reads Run dada 2 step by step
seqtab.nochim <- removeBimeraDenovo(seqtabF, method="consensus", multithread=TRUE)
dim(seqtab.nochim)
## [1] 15 15
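Besides the number of ASVs kept, it is worth checking what fraction of the reads survived chimera removal; a quick check in the spirit of the dada2 tutorial:
# Proportion of total reads retained after bimera removal
sum(seqtab.nochim) / sum(seqtabF)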
seqtab.nochim.df <- as.data.frame(seqtab.nochim)
# Output files
conv_file <- file.path(output.dir, "hash_key.csv")
conv_file.fasta <- file.path(output.dir, "hash_key.fasta")
ASV_file <- file.path(output.dir, "ASV_table.csv")
# Now decide if you want hashing or not
if (params$hash == TRUE) {
  conv_table <- tibble(Hash = "", Sequence = "")
  hashes <- list(NULL)
  ######## Create the Hash key file
  for (i in 1:ncol(seqtab.nochim.df)) {            # for each column of the dataframe
    current_seq <- colnames(seqtab.nochim.df)[i]   # take the whole sequence from each column of the dataframe
    current_hash <- digest(current_seq, algo = "sha1", serialize = F, skip = "auto")  # hash it so it is both shorter and unique
    hashes[[i]] <- current_hash
    conv_table[i, ] <- c(current_hash, current_seq)  # add the Hash - sequence conversion to a table
    colnames(seqtab.nochim.df)[i] <- current_hash
  }
  write_csv(conv_table, conv_file)                 # write the table into a file
  write.fasta(sequences = as.list(conv_table$Sequence),
              names = as.list(conv_table$Hash),
              file.out = conv_file.fasta)
  seqtab.nochim.df <- bind_cols(sample.metadata %>%
                                  select(Sample_name, Locus),
                                seqtab.nochim.df)
  seqtab.nochim.df %>%
    pivot_longer(cols = c(-Sample_name, -Locus),
                 names_to = "Hash",
                 values_to = "nReads") %>%
    filter(nReads > 0) -> current_asv
  write_csv(current_asv, ASV_file)
} else {
  # What to do if you don't want hashes: two things - change the header of the ASV table, write only one file
  seqtab.nochim.df %>%
    pivot_longer(cols = c(-Sample_name, -Locus),
                 names_to = "Sequence",
                 values_to = "nReads") %>%
    filter(nReads > 0) -> current_asv
  write_csv(current_asv, ASV_file)
}
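When hashing is on, downstream scripts can always recover the full sequences by joining the ASV table back to the hash key; a minimal sketch using the two files written above:
# Re-attach sequences to the hashed ASV table (only relevant when params$hash == TRUE)
ASV.table <- read_csv(ASV_file)
hash.key  <- read_csv(conv_file)
ASV.table.with.seqs <- left_join(ASV.table, hash.key, by = "Hash")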
## Track the fate of all reads
getN <- function(x) sum(getUniques(x))
output.dada2 %>%
  select(-file1, -file2, -filtF1, -filtR1, -errF1, -errR1, -derepF1, -derepR1) %>%
  mutate_at(.vars = c("dadaF1", "dadaR1", "mergers"),
            ~ sapply(.x, getN)) %>%
  # pull(outFs) -> test
  mutate(input = map_dbl(outFs, ~ .x[1]),       # reads.in from filterAndTrim
         filtered = map_dbl(outFs, ~ .x[2]),    # reads.out from filterAndTrim
         tabled = rowSums(seqtabF),
         nonchim = rowSums(seqtab.nochim)) %>%
  select(Sample_name,
         Locus,
         input,
         filtered,
         denoised_F = dadaF1,
         denoised_R = dadaR1,
         merged = mergers,
         tabled,
         nonchim) -> track
write_csv(track, file.path(params$output.folder, "dada2_summary.csv"))
kable(track, align = "c", format = "html") %>%
  kable_styling(bootstrap_options = "striped", full_width = T, position = "center") %>%
  column_spec(2, bold = T)
Sample_name | Locus | input | filtered | denoised_F | denoised_R | merged | tabled | nonchim |
---|---|---|---|---|---|---|---|---|
51530-026_S79 | 16S_Prey | 996 | 429 | 421 | 429 | 421 | 421 | 421 |
51530-027_S4 | 16S_Prey | 995 | 516 | 510 | 509 | 509 | 509 | 452 |
51530-029_S14 | 16S_Prey | 993 | 650 | 646 | 641 | 625 | 625 | 584 |
51530-031_S26 | 16S_Prey | 997 | 735 | 730 | 734 | 729 | 729 | 729 |
51530-032_S37 | 16S_Prey | 991 | 698 | 698 | 698 | 698 | 698 | 698 |
51530-033_S48 | 16S_Prey | 992 | 722 | 720 | 720 | 720 | 720 | 720 |
51530-034_S59 | 16S_Prey | 990 | 736 | 735 | 736 | 735 | 735 | 735 |
51530-035_S69 | 16S_Prey | 990 | 666 | 664 | 664 | 662 | 662 | 662 |
51530-036_S80 | 16S_Prey | 995 | 719 | 718 | 719 | 670 | 670 | 655 |
51530-037_S5 | 16S_Prey | 986 | 636 | 634 | 629 | 554 | 554 | 554 |
51530-038_S15 | 16S_Prey | 996 | 732 | 728 | 732 | 728 | 728 | 728 |
51530-039_S27 | 16S_Prey | 992 | 674 | 672 | 670 | 670 | 670 | 670 |
51530-040_S38 | 16S_Prey | 987 | 573 | 568 | 573 | 531 | 531 | 514 |
51530-041_S49 | 16S_Prey | 993 | 651 | 651 | 649 | 649 | 649 | 649 |
51530-042_S60 | 16S_Prey | 994 | 655 | 653 | 654 | 643 | 643 | 643 |
track %>%
  mutate_if(is.numeric, as.integer) %>%
  pivot_longer(cols = c(-Sample_name, -Locus),
               names_to = "Step",
               values_to = "Number of Sequences") %>%
  mutate(Step = fct_relevel(Step,
                            "input", "filtered", "denoised_F", "denoised_R", "merged", "tabled", "nonchim")) %>%
  ggplot(aes(x = Step, y = `Number of Sequences`, group = Sample_name, color = Sample_name)) +
  geom_line() +
  facet_wrap(~Locus) +
  guides(color = "none")