---
title: "Yellow Island eDNA"
output: html_document
---
#this is a test run of Rachel's code for decomtamination of the eDNA data. It's the next step in the pipeline after taxonomic assignment.
#get decontam installed if you haven't
install.packages(devtools)
library(devtools)
install.packages("vctrs")
devtools::install_github("benjjneb/decontam")
#trying to install decom
if (!requireNamespace("BiocManager", quietly = TRUE))
install.packages("BiocManager")
BiocManager::install("decontam")
library(decontam)
library(phyloseq)
library(ranacapa)
library(devtools)
library(ggplot2)
library(vegan)
#getwd()
#set working directory
3setwd("/Users/cmantegna/Documents/YellowIsland2023")
#read in files
18S_30<-read.csv("~/Documents/YellowIsland2023/data/Tronko_Results/Yellow_Sep5_q35_18S_Max5.txt", header=TRUE, sep="\t")
18S_5<-read.csv( /data/Tronko_Results/Yellow_Sep5_q35_18S_Max5.txt, header=TRUE, sep="\t")
CO1_30<-read.csv( /data/Tronko_Results/Yellow_Sep5_q35_CO1_Max30.txt, header=TRUE, sep="\t")
CO1_5<-read.csv( /data/Tronko_Results/Yellow_Sep5_q35_CO1_Max5.txt, header=TRUE, sep="\t")
biom<-read.csv("ByCatch_MetadataOutput_Metabarcoding.txt", header=TRUE, sep="\t")
library(phyloseq)
library(ranacapa)
library(devtools)
library(ggplot2)
library(vegan)
#Import some ranacapa functions
convert_anacapa_to_phyloseq <- function(taxon_table, metadata_file) {
# Validate the files
validate_input_files(taxon_table, metadata_file)
# Group the anacapa ouptut by taxonomy, if it has not yet happened, and turn it into a matrix
taxon_table2 <- group_anacapa_by_taxonomy(taxon_table) %>%
tibble::column_to_rownames("sum.taxonomy") %>%
as.matrix
# Reorder the columns (sites) for ease of displaying later
taxon_table2 <- taxon_table2[ , order(colnames(taxon_table2))]
# Convert the matrix into a phyloseq otu_table object, with taxa as the rows
ana_taxon_table_physeq <- phyloseq::otu_table(taxon_table2, taxa_are_rows = TRUE)
# Extract the rownames of the matrix above- this has the full taxonomic path.
# Split the taxonomic path on semicolons, and turn the resulting matrix into
# a phyloseq tax_table object
taxon_names <- reshape2::colsplit(rownames(taxon_table2), ";",
names = c("Domain","Phylum","Class","Order","Family","Genus","Species")) %>%
as.matrix
rownames(taxon_names) <- rownames(taxon_table2)
tax_physeq <- phyloseq::tax_table(taxon_names)
colnames(tax_physeq) <- c("Domain","Phylum","Class","Order","Family","Genus","Species")
# Make a phyloseq object out of the otu_table and the tax_table objects
physeq_object <- phyloseq::phyloseq(ana_taxon_table_physeq, tax_physeq)
# Make sure the mapping file (ie the site metadata) is ordered according to site name
rownames(metadata_file) <- metadata_file[, 1]
metadata_file <- metadata_file[order(metadata_file[, 1]), ]
# Convert the mapping file into a phyloseq sample_data object, and merge it with the
# phyloseq object created above to make a phyloseq object with otu table, tax table, and sample data.
sampledata <- phyloseq::sample_data(metadata_file)
phyloseq::merge_phyloseq(physeq_object, sampledata)
}
vegan_otu <- function(physeq_object) {
OTU <- phyloseq::otu_table(physeq_object)
if (phyloseq::taxa_are_rows(OTU)) {
OTU <- t(OTU)
}
return(methods::as(OTU, "matrix"))
}
#now make a phyloseq object
physeqCIB_FITS_30<-convert_anacapa_to_phyloseq(CIB_FITS_30,biom)
physeqCIB_FITS_5<-convert_anacapa_to_phyloseq(CIB_FITS_5,biom)
physeqCIB_CO1_30<-convert_anacapa_to_phyloseq(CIB_CO1_30,biom)
physeqCIB_CO1_5<-convert_anacapa_to_phyloseq(CIB_CO1_5,biom)
#summary_taxa_decontam
ntaxa(physeqCIB_FITS_30)
ntaxa(physeqCIB_FITS_5)
ntaxa(physeqCIB_CO1_30)
ntaxa(physeqCIB_CO1_5)
#decontaminate CO1_5
table(sample_data(physeqCIB_CO1_5)[,"Sample_or_Control"])
df <- as.data.frame(sample_data(physeqCIB_CO1_5))
library(ggplot2)
library(ggpubr)
library(decontam)
df$LibrarySize <- sample_sums(physeqCIB_CO1_5)
df <- df[order(df$LibrarySize),]
df$Index <- seq(nrow(df))
p<-ggplot(data=df, aes(x=Index, y=LibrarySize, color=Sample_or_Control)) + geom_point()
ggsave("physeqCIB_CO1_5_libsizes_predecontam.pdf",p, width=6, height=5, units="cm", scale=3, limitsize = FALSE)
#try 0.1 decontam
sample_data(physeqCIB_CO1_5)$Sample_or_Control == "Control"
sample_data(physeqCIB_CO1_5)$is.neg <- sample_data(physeqCIB_CO1_5)$Sample_or_Control == "Control"
contamdf.prev01 <- isContaminant(physeqCIB_CO1_5, method="prevalence", neg="is.neg", threshold=0.1)
head(which(contamdf.prev01$contaminant))
table(contamdf.prev01$contaminant)
physeqCIB_CO1_5_dc<-prune_taxa(!contamdf.prev01$contaminant, physeqCIB_CO1_5)
df <- as.data.frame(sample_data(physeqCIB_CO1_5_dc))
df$LibrarySize <- sample_sums(physeqCIB_CO1_5_dc)
df <- df[order(df$LibrarySize),]
df$Index <- seq(nrow(df))
p<-ggplot(data=df, aes(x=Index, y=LibrarySize, color=Sample_or_Control)) + geom_point()
ggsave("physeqCIB_CO1_5_libsizes_postdecontam0.1.pdf",p, width=6, height=5, units="cm", scale=3, limitsize = FALSE)
physeqCIB_CO1_5_dct = filter_taxa(physeqCIB_CO1_5_dc,function(x) sum(x > 0) > 1, TRUE)
write.table(data.frame("sum.taxonomy"=rownames(otu_table(physeqCIB_CO1_5_dct)),otu_table(physeqCIB_CO1_5_dct)),"physeqCIB_CO1_5_dct_0.1.txt", row.names=FALSE, sep="\t",quote = FALSE)
#decontaminate FITS_5
table(sample_data(physeqCIB_FITS_5)[,"Sample_or_Control"])
df <- as.data.frame(sample_data(physeqCIB_FITS_5))
library(ggplot2)
library(ggpubr)
library(decontam)
df$LibrarySize <- sample_sums(physeqCIB_FITS_5)
df <- df[order(df$LibrarySize),]
df$Index <- seq(nrow(df))
p<-ggplot(data=df, aes(x=Index, y=LibrarySize, color=Sample_or_Control)) + geom_point()
ggsave("physeqCIB_FITS_5_libsizes_predecontam.pdf",p, width=6, height=5, units="cm", scale=3, limitsize = FALSE)
#try 0.1 decontam
sample_data(physeqCIB_FITS_5)$Sample_or_Control == "Control"
sample_data(physeqCIB_FITS_5)$is.neg <- sample_data(physeqCIB_FITS_5)$Sample_or_Control == "Control"
contamdf.prev01 <- isContaminant(physeqCIB_FITS_5, method="prevalence", neg="is.neg", threshold=0.1)
head(which(contamdf.prev01$contaminant))
table(contamdf.prev01$contaminant)
physeqCIB_FITS_5_dc<-prune_taxa(!contamdf.prev01$contaminant, physeqCIB_FITS_5)
df <- as.data.frame(sample_data(physeqCIB_FITS_5_dc))
df$LibrarySize <- sample_sums(physeqCIB_FITS_5_dc)
df <- df[order(df$LibrarySize),]
df$Index <- seq(nrow(df))
p<-ggplot(data=df, aes(x=Index, y=LibrarySize, color=Sample_or_Control)) + geom_point()
ggsave("physeqCIB_FITS_5_libsizes_postdecontam0.1.pdf",p, width=6, height=5, units="cm", scale=3, limitsize = FALSE)
physeqCIB_FITS_5_dct = filter_taxa(physeqCIB_FITS_5_dc,function(x) sum(x > 0) > 1, TRUE)
write.table(data.frame("sum.taxonomy"=rownames(otu_table(physeqCIB_FITS_5_dct)),otu_table(physeqCIB_FITS_5_dct)),"physeqCIB_FITS_5_dct_0.1.txt", row.names=FALSE, sep="\t",quote = FALSE)
#decontaminate CO1_30
table(sample_data(physeqCIB_CO1_30)[,"Sample_or_Control"])
df <- as.data.frame(sample_data(physeqCIB_CO1_30))
library(ggplot2)
library(ggpubr)
library(decontam)
df$LibrarySize <- sample_sums(physeqCIB_CO1_30)
df <- df[order(df$LibrarySize),]
df$Index <- seq(nrow(df))
p<-ggplot(data=df, aes(x=Index, y=LibrarySize, color=Sample_or_Control)) + geom_point()
ggsave("physeqCIB_CO1_30_libsizes_predecontam.pdf",p, width=6, height=5, units="cm", scale=3, limitsize = FALSE)
#try 0.1 decontam
sample_data(physeqCIB_CO1_30)$Sample_or_Control == "Control"
sample_data(physeqCIB_CO1_30)$is.neg <- sample_data(physeqCIB_CO1_30)$Sample_or_Control == "Control"
contamdf.prev01 <- isContaminant(physeqCIB_CO1_30, method="prevalence", neg="is.neg", threshold=0.1)
head(which(contamdf.prev01$contaminant))
table(contamdf.prev01$contaminant)
physeqCIB_CO1_30_dc<-prune_taxa(!contamdf.prev01$contaminant, physeqCIB_CO1_30)
df <- as.data.frame(sample_data(physeqCIB_CO1_30_dc))
df$LibrarySize <- sample_sums(physeqCIB_CO1_30_dc)
df <- df[order(df$LibrarySize),]
df$Index <- seq(nrow(df))
p<-ggplot(data=df, aes(x=Index, y=LibrarySize, color=Sample_or_Control)) + geom_point()
ggsave("physeqCIB_CO1_30_libsizes_postdecontam0.1.pdf",p, width=6, height=5, units="cm", scale=3, limitsize = FALSE)
physeqCIB_CO1_30_dct = filter_taxa(physeqCIB_CO1_30_dc,function(x) sum(x > 0) > 1, TRUE)
write.table(data.frame("sum.taxonomy"=rownames(otu_table(physeqCIB_CO1_30_dct)),otu_table(physeqCIB_CO1_30_dct)),"physeqCIB_CO1_30_dct_0.1.txt", row.names=FALSE, sep="\t",quote = FALSE)
#decontaminate FITS_30
table(sample_data(physeqCIB_FITS_30)[,"Sample_or_Control"])
df <- as.data.frame(sample_data(physeqCIB_FITS_30))
library(ggplot2)
library(ggpubr)
library(decontam)
df$LibrarySize <- sample_sums(physeqCIB_FITS_30)
df <- df[order(df$LibrarySize),]
df$Index <- seq(nrow(df))
p<-ggplot(data=df, aes(x=Index, y=LibrarySize, color=Sample_or_Control)) + geom_point()
ggsave("physeqCIB_FITS_30_libsizes_predecontam.pdf",p, width=6, height=5, units="cm", scale=3, limitsize = FALSE)
#try 0.1 decontam
sample_data(physeqCIB_FITS_30)$Sample_or_Control == "Control"
sample_data(physeqCIB_FITS_30)$is.neg <- sample_data(physeqCIB_FITS_30)$Sample_or_Control == "Control"
contamdf.prev01 <- isContaminant(physeqCIB_FITS_30, method="prevalence", neg="is.neg", threshold=0.1)
head(which(contamdf.prev01$contaminant))
table(contamdf.prev01$contaminant)
physeqCIB_FITS_30_dc<-prune_taxa(!contamdf.prev01$contaminant, physeqCIB_FITS_30)
df <- as.data.frame(sample_data(physeqCIB_FITS_30_dc))
df$LibrarySize <- sample_sums(physeqCIB_FITS_30_dc)
df <- df[order(df$LibrarySize),]
df$Index <- seq(nrow(df))
p<-ggplot(data=df, aes(x=Index, y=LibrarySize, color=Sample_or_Control)) + geom_point()
ggsave("physeqCIB_FITS_30_libsizes_postdecontam0.1.pdf",p, width=6, height=5, units="cm", scale=3, limitsize = FALSE)
physeqCIB_FITS_30_dct = filter_taxa(physeqCIB_FITS_30_dc,function(x) sum(x > 0) > 1, TRUE)
write.table(data.frame("sum.taxonomy"=rownames(otu_table(physeqCIB_FITS_30_dct)),otu_table(physeqCIB_FITS_30_dct)),"physeqCIB_FITS_30_dct_0.1.txt", row.names=FALSE, sep="\t",quote = FALSE)
#remove blanks from decontaminated phyloseq objects
#remove blanks, make min 5
physeqCIB_CO1_30_dct_noblanks = subset_samples(physeqCIB_CO1_30_dct, Sample_or_Control == "Sample")
physeqCIB_CO1_5_dct_noblanks = subset_samples(physeqCIB_CO1_5_dct, Sample_or_Control == "Sample")
physeqCIB_FITS_30_dct_noblanks = subset_samples(physeqCIB_FITS_30_dct, Sample_or_Control == "Sample")
physeqCIB_FITS_5_dct_noblanks = subset_samples(physeqCIB_FITS_5_dct, Sample_or_Control == "Sample")
physeqCIB_CO1_30_dct_noblanks_min5 = prune_taxa(taxa_sums(physeqCIB_CO1_30_dct_noblanks) > 4, physeqCIB_CO1_30_dct_noblanks)
write.table(data.frame("sum.taxonomy"=rownames(otu_table(physeqCIB_CO1_30_dct_noblanks_min5)),otu_table(physeqCIB_CO1_30_dct_noblanks_min5)),"physeqCIB_CO1_30_dct_noblanks_min5.txt", row.names=FALSE, sep="\t",quote = FALSE)
saveRDS(physeqCIB_CO1_30_dct_noblanks_min5, "physeqCIB_CO1_30_dct_noblanks_min5.RDS")
physeqCIB_CO1_5_dct_noblanks_min5 = prune_taxa(taxa_sums(physeqCIB_CO1_5_dct_noblanks) > 4, physeqCIB_CO1_5_dct_noblanks)
write.table(data.frame("sum.taxonomy"=rownames(otu_table(physeqCIB_CO1_5_dct_noblanks_min5)),otu_table(physeqCIB_CO1_5_dct_noblanks_min5)),"physeqCIB_CO1_5_dct_noblanks_min5.txt", row.names=FALSE, sep="\t",quote = FALSE)
saveRDS(physeqCIB_CO1_5_dct_noblanks_min5, "physeqCIB_CO1_5_dct_noblanks_min5.RDS")
physeqCIB_FITS_5_dct_noblanks_min5 = prune_taxa(taxa_sums(physeqCIB_FITS_5_dct_noblanks) > 4, physeqCIB_FITS_5_dct_noblanks)
write.table(data.frame("sum.taxonomy"=rownames(otu_table(physeqCIB_FITS_5_dct_noblanks_min5)),otu_table(physeqCIB_FITS_5_dct_noblanks_min5)),"physeqCIB_FITS_5_dct_noblanks_min5.txt", row.names=FALSE, sep="\t",quote = FALSE)
saveRDS(physeqCIB_FITS_5_dct_noblanks_min5, "physeqCIB_FITS_5_dct_noblanks_min5.RDS")
physeqCIB_FITS_30_dct_noblanks_min5 = prune_taxa(taxa_sums(physeqCIB_FITS_30_dct_noblanks) > 4, physeqCIB_FITS_30_dct_noblanks)
write.table(data.frame("sum.taxonomy"=rownames(otu_table(physeqCIB_FITS_30_dct_noblanks_min5)),otu_table(physeqCIB_FITS_30_dct_noblanks_min5)),"physeqCIB_FITS_30_dct_noblanks_min5.txt", row.names=FALSE, sep="\t",quote = FALSE)
saveRDS(physeqCIB_FITS_30_dct_noblanks_min5, "physeqCIB_FITS_30_dct_noblanks_min5.RDS")
physeqCIB_CO1_5_dct_noblanks_min5_Arthropoda<-subset_taxa(physeqCIB_CO1_5_dct_noblanks_min5, Domain=="Arthropoda")
physeqCIB_CO1_30_dct_noblanks_min5_Arthropoda<-subset_taxa(physeqCIB_CO1_30_dct_noblanks_min5, Domain=="Arthropoda")
### rarefaction
##how to subset by a metadata column, in this case location
physeqCIB_CO1_5_dct_noblanks_min5_Arthropoda_CDFA = subset_samples(physeqCIB_CO1_5_dct_noblanks_min5_Arthropoda, Site == "CDFA")
physeqCIB_CO1_5_dct_noblanks_min5_Arthropoda_SDNHM = subset_samples(physeqCIB_CO1_5_dct_noblanks_min5_Arthropoda, Site == "SD Natural History Museum")
physeqCIB_CO1_5_dct_noblanks_min5_Arthropoda_SSI = subset_samples(physeqCIB_CO1_5_dct_noblanks_min5_Arthropoda, Site == "Sierra Streams Institute ")
write.table(data.frame("sum.taxonomy"=rownames(otu_table(physeqCIB_CO1_5_dct_noblanks_min5_Arthropoda_CDFA)),otu_table(physeqCIB_CO1_5_dct_noblanks_min5_Arthropoda_CDFA)),"physeqCIB_CO1_5_dct_noblanks_min5_Arthropoda_CDFA.txt", row.names=FALSE, sep="\t",quote = FALSE)
write.table(data.frame("sum.taxonomy"=rownames(otu_table(physeqCIB_CO1_5_dct_noblanks_min5_Arthropoda_SDNHM)),otu_table(physeqCIB_CO1_5_dct_noblanks_min5_Arthropoda_SDNHM)),"physeqCIB_CO1_5_dct_noblanks_min5_Arthropoda_SDNHM.txt", row.names=FALSE, sep="\t",quote = FALSE)
write.table(data.frame("sum.taxonomy"=rownames(otu_table(physeqCIB_CO1_5_dct_noblanks_min5_Arthropoda_SSI)),otu_table(physeqCIB_CO1_5_dct_noblanks_min5_Arthropoda_SSI)),"physeqCIB_CO1_5_dct_noblanks_min5_Arthropoda_SSI.txt", row.names=FALSE, sep="\t",quote = FALSE)