{
echo "#### Assign Variables ####"
echo ""
echo "# Data directories"
echo 'export timeseries_dir=/home/shared/8TB_HDD_02/shedurkin/timeseries_molecular'
echo 'export output_dir_top="${timeseries_dir}/D-Apul/output/03.00-D-Apul-RNAseq-gene-expression-DESeq2"'
echo ""
echo "# Output files"
echo 'export coldata="${output_dir_top}/DESeq2-coldata.tab"'
echo ""
echo "# Paths to programs"
echo 'export programs_dir="/home/shared"'
echo "# Set number of CPUs to use"
echo 'export threads=40'
echo ""
echo "# Print formatting"
echo 'export line="--------------------------------------------------------"'
echo ""
} > .bashvars
cat .bashvars
#### Assign Variables ####
# Data directories
export timeseries_dir=/home/shared/8TB_HDD_02/shedurkin/timeseries_molecular
export output_dir_top="${timeseries_dir}/D-Apul/output/03.00-D-Apul-RNAseq-gene-expression-DESeq2"
# Output files
export coldata="${output_dir_top}/DESeq2-coldata.tab"
# Paths to programs
export programs_dir="/home/shared"
# Set number of CPUs to use
export threads=40
# Print formatting
export line="--------------------------------------------------------"
# Define the output directory path
output_dir <- "../output/03.00-D-Apul-RNAseq-gene-expression-DESeq2/"
# Set desired false discovery rate threshold (i.e. adjusted p-value, padj)
fdr <- 0.05
# Set log2 fold change threshold (a value of '1' is equal to a fold change of '2')
log2fc <- 1
# Load bash variables into memory
source .bashvars
# Create output directory, if it doesn't exist
mkdir --parents "${output_dir_top}"
# Create associative array with sample and timepoint
metadata="../../M-multi-species/data/rna_metadata.csv"
# Create DESeq2-formatted coldata file
## Create header
printf "\t%s\t%s\n" "time.point" "colony.id"> "${coldata}"
## Read the metadata file line by line
while IFS=',' read -r sample_number sample_name plate well_number azenta_sample_name colony_id timepoint sample_type species_strain SampleBuffer; do
# Check if the species is "Acropora pulchra"
if [[ "${species_strain}" == "Acropora pulchra" ]]; then
printf "%s\t%s\t%s\n" "${azenta_sample_name}" "${timepoint}" "${colony_id}"
fi
done < <(tail -n +2 "${metadata}") \
| sort -k1,1 \
>> "${coldata}"
## Tab-delimited output of sample and timepoint
for sample in "${!sample_timepoint_map[@]}"
do
printf "%s\t%s\n" "$sample" "${sample_timepoint_map[$sample]}"
done | sort -k1,1 \
>> "${coldata}"
# Peek at output
head "${coldata}" | column -t
echo ""
echo "${line}"
echo ""
wc -l "${coldata}"
echo ""
echo "${line}"
echo ""
echo "Colony counts:"
echo ""
awk 'NR > 1 {print $3}' "${coldata}" | sort | uniq -c
gene.counts <- as.matrix(read.csv(file = "../output/02.20-D-Apul-RNAseq-alignment-HiSat2/apul-gene_count_matrix.csv", row.names="gene_id", check.names = FALSE))
coldata <- read.csv(file = "../output/03.00-D-Apul-RNAseq-gene-expression-DESeq2/DESeq2-coldata.tab", row.names=1, sep = "\t")
coldata$time.point <- factor(coldata$time.point)
head(gene.counts)
head(coldata)
all(rownames(coldata) == colnames(gene.counts))
dds <- DESeqDataSetFromMatrix(countData = gene.counts,
colData = coldata,
design = ~ time.point + colony.id)
dds
featureData <- data.frame(gene=rownames(gene.counts))
mcols(dds) <- DataFrame(mcols(dds), featureData)
mcols(dds)
dds$time.point <- factor(dds$time.point, levels = c("TP1","TP2", "TP3", "TP4"))
dds <- DESeq(dds)
tp1.v.tp2.results <- results(dds, contrast=c("time.point","TP1","TP2"), alpha = fdr, lfcThreshold = log2fc)
tp1.v.tp3.results <- results(dds, contrast=c("time.point","TP1","TP3"), alpha = fdr, lfcThreshold = log2fc)
tp1.v.tp4.results <- results(dds, contrast=c("time.point","TP1","TP4"), alpha = fdr, lfcThreshold = log2fc)
tp2.v.tp3.results <- results(dds, contrast=c("time.point","TP2","TP3"), alpha = fdr, lfcThreshold = log2fc)
tp2.v.tp4.results <- results(dds, contrast=c("time.point","TP2","TP4"), alpha = fdr, lfcThreshold = log2fc)
tp3.v.tp4.results <- results(dds, contrast=c("time.point","TP3","TP4"), alpha = fdr, lfcThreshold = log2fc)
tp2.v.tp4.results
summary(tp2.v.tp4.results)
table(tp2.v.tp4.results$padj < 0.05)
# Create a named list of the data frames
results_list <- list(
tp1.v.tp2.results = tp1.v.tp2.results,
tp1.v.tp3.results = tp1.v.tp3.results,
tp1.v.tp4.results = tp1.v.tp4.results,
tp2.v.tp3.results = tp2.v.tp3.results,
tp2.v.tp4.results = tp2.v.tp4.results,
tp3.v.tp4.results = tp3.v.tp4.results
)
# Loop through the list and write each data frame to a CSV file in the specified directory
for (df_name in names(results_list)) {
write.csv(results_list[[df_name]], file = paste0(output_dir, df_name, ".table.csv"), row.names = TRUE, quote = FALSE)
}
vsd <- varianceStabilizingTransformation(dds, blind=FALSE)
NOTE: Hover over points to see the sample numbers
sampleDists <- dist(t(assay(vsd)))
sampleDistMatrix <- as.matrix( sampleDists )
rownames(sampleDistMatrix) <- paste( vsd$colony.id, vsd$time.point, sep = " - " )
colnames(sampleDistMatrix) <- NULL
colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255)
pheatmap(sampleDistMatrix,
clustering_distance_rows = sampleDists,
clustering_distance_cols = sampleDists,
col = colors)
Visualize sample clustering via PCA (after transformation)
# PCA with points color coded by time point
plotPCA(vsd, intgroup = c("time.point"))
# PCA with points color coded by colony ID
plotPCA(vsd, intgroup = c("colony.id"))
Time points 1 and 4 are clustering together, and time points 2 and 3 are clustering together. It also looks like colonies cluster somewhat.
top_20_counts_all <- order(rowMeans(counts(dds,normalized=TRUE)),
decreasing=TRUE)[1:200]
timepoint_annotation = colData(dds) %>% as.data.frame() %>% select(time.point)
pheatmap(assay(vsd)[top_20_counts_all,],
cluster_rows=FALSE,
show_rownames=FALSE,
cluster_cols=TRUE,
annotation_col = timepoint_annotation)