--- title: "RNAseq gene expression" author: "Ariana S Huffmyer" date: "2023" output: html_document editor_options: chunk_output_type: console --- # Set up ```{r setup, include=FALSE} knitr::opts_chunk$set(echo = TRUE) # The following setting is important, do not omit. options(stringsAsFactors = FALSE) #Set Strings to character ``` # Load libraries ```{r} if ("tidyverse" %in% rownames(installed.packages()) == 'FALSE') install.packages('tidyverse') if ("genefilter" %in% rownames(installed.packages()) == 'FALSE') BiocManager::install("genefilter") if ("DESeq2" %in% rownames(installed.packages()) == 'FALSE') BiocManager::install('DESeq2') if ("RColorBrewer" %in% rownames(installed.packages()) == 'FALSE') install.packages('RColorBrewer') if ("WGCNA" %in% rownames(installed.packages()) == 'FALSE') install.packages('WGCNA') if ("flashClust" %in% rownames(installed.packages()) == 'FALSE') install.packages('flashClust') if ("gridExtra" %in% rownames(installed.packages()) == 'FALSE') install.packages('gridExtra') if ("ComplexHeatmap" %in% rownames(installed.packages()) == 'FALSE') BiocManager::install('ComplexHeatmap') if ("goseq" %in% rownames(installed.packages()) == 'FALSE') BiocManager::install('goseq') if ("dplyr" %in% rownames(installed.packages()) == 'FALSE') install.packages('dplyr') if ("clusterProfiler" %in% rownames(installed.packages()) == 'FALSE') BiocManager::install('clusterProfiler') if ("pheatmap" %in% rownames(installed.packages()) == 'FALSE') install.packages('pheatmap') if ("magrittr" %in% rownames(installed.packages()) == 'FALSE') install.packages('magrittr') if ("vegan" %in% rownames(installed.packages()) == 'FALSE') install.packages('vegan') if ("factoextra" %in% rownames(installed.packages()) == 'FALSE') install.packages('factoextra') library("tidyverse") library("genefilter") library("DESeq2") library("RColorBrewer") library("WGCNA") library("flashClust") library("gridExtra") library("ComplexHeatmap") library("goseq") library("dplyr") library("clusterProfiler") library("pheatmap") library("magrittr") library("vegan") library("factoextra") library("dplyr") library("viridis") ``` # Load data and metadata Load metadata and gene count matrix. ```{r} #load metadata sheet with sample name and treatment information metadata <- read.csv("A-Pver/data/rna-seq/metadata_NutrientEnrichment_Pverr_RNASeq.csv", header = TRUE, sep = ",") metadata<-metadata%>% select(SampleID, Block, FragmentID, Treatment, SRR) head(metadata) #load gene count matrix generated from cluster computation gcount <- as.data.frame(read.csv("A-Pver/data/rna-seq/Pverr_gene_count_matrix.csv", row.names="gene_id"), colClasses = double) head(gcount) names(gcount) ``` Match file name with SRR number to instead be the Fragment ID. ```{r} names(gcount)<-gsub("^[^.]*\\.|_.*$", "", names(gcount)) #restructure column names names(gcount) <- metadata$FragmentID[match(names(gcount), metadata$SRR)] ``` # Filter data *Remove 0 count genes*: Check that there are no genes with 0 counts across all samples. ```{r} nrow(gcount) gcount<-gcount %>% mutate(Total = rowSums(.[, 1:32]))%>% filter(!Total==0)%>% dplyr::select(!Total) nrow(gcount) ``` We had 28,111 genes, which was filtered down to 25,172 by removing genes with row sums of 0 (those not detected at all in our sequences). *pOverA*: Specifying the minimum count for a proportion of samples for each gene. Here, we are using a pOverA of 0.5, 10. This is because we have 32 samples with a minimum of n=1 sample per fragment and half in enriched and half in control. Therefore, we will accept genes that are present in 16/32 = 0.5 of the samples because we may expect different expression by treatment We are further setting the minimum count of genes to 10, such that 50% of the samples must have a gene count of >10 in order for the gene to remain in the data set. *Note: Danielle used a PoverA of 0.90,10 - not sure why.* Filter in the package "genefilter". Pre-filtering our dataset to reduce the memory size dataframe, increase the speed of the transformation and testing functions, and improve quality of statistical analysis by removing low-coverage counts. ```{r} filt <- filterfun(pOverA(0.5,10)) #create filter for the counts data gfilt <- genefilter(gcount, filt) #identify genes to keep by count filter gkeep <- gcount[gfilt,] #identify genes to keep by count filter gkeep <- gcount[gfilt,] #identify gene lists gn.keep <- rownames(gkeep) #gene count data filtered in PoverA, P percent of the samples have counts over A gcount_filt <- as.data.frame(gcount[which(rownames(gcount) %in% gn.keep),]) #How many rows do we have before and after filtering? nrow(gcount) #Before nrow(gcount_filt) #After ``` Before filtering we had 25,172 genes and after we have 20,326 genes. Check that row and column names match from metadata to the count matrix in the correct order. Display current order of metadata and gene count matrix. ```{r} metadata$FragmentID colnames(gcount_filt) ``` These are not in the same order, re order them. ```{r} list<-colnames(gcount_filt) list<-as.factor(list) metadata$FragmentID<-as.factor(metadata$FragmentID) # Re-order the levels metadata$FragmentID <- factor(as.character(metadata$FragmentID), levels=list) # Re-order the data.frame metadata_ordered <- metadata[order(metadata$FragmentID),] metadata_ordered$FragmentID metadata_ordered<-metadata_ordered%>% dplyr::select(FragmentID, Block, Treatment) ``` Check the order now that it has been corrected. ```{r} metadata_ordered$FragmentID colnames(gcount_filt) ``` These now match. # Contruct DESeq2 dataset ```{r} #Set DESeq2 design gdds <- DESeqDataSetFromMatrix(countData = gcount_filt, colData = metadata_ordered, design = ~Treatment) ``` *check that Danielle's data is properly ordered - this could be a reason our data do not match* Conduct VST transformation. ```{r} SF.gdds <- estimateSizeFactors(gdds) #estimate size factors to determine if we can use vst to transform our data. Size factors should be less than 4 for us to use vst print(sizeFactors(SF.gdds)) #View size factors ``` All size factors are less than 4, so we can use VST transformation. ```{r} gvst <- vst(gdds, blind=FALSE) #apply a variance stabilizing transformation to minimize effects of small counts and normalize wrt library size head(assay(gvst), 3) #view transformed gene count data for the first three genes in the dataset. ``` # Conduct PERMANOVA and PCA Export data for PERMANOVA test. ```{r} test<-t(assay(gvst)) #export as matrix test<-as.data.frame(test) #add category columns test$FragmentID<-rownames(test) test$Treatment<-metadata_ordered$Treatment[match(test$FragmentID, metadata_ordered$FragmentID)] ``` Build PERMANOVA model. ```{r} scaled_test <-prcomp(test[c(1:20326)], scale=TRUE, center=TRUE) fviz_eig(scaled_test) # scale data vegan <- scale(test[c(1:20326)]) # PerMANOVA permanova<-adonis2(vegan ~ Treatment, data = test, method='eu') permanova ``` Gene expression is significantly different between Treatments (p=0.023). Permutation test for adonis under reduced model Terms added sequentially (first to last) Permutation: free Number of permutations: 999 adonis2(formula = vegan ~ Treatment, data = test, method = "eu") Df SumOfSqs R2 F Pr(>F) Treatment 1 33129 0.05258 1.6648 0.023 * Residual 30 596977 0.94742 Total 31 630106 1.00000 Plot a heatmap to sample to sample distances ```{r} gsampleDists <- dist(t(assay(gvst))) #calculate distance matix gsampleDistMatrix <- as.matrix(gsampleDists) #distance matrix rownames(gsampleDistMatrix) <- colnames(gvst) #assign row names colnames(gsampleDistMatrix) <- NULL #assign col names colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255) #assign colors save_pheatmap_pdf <- function(x, filename, width=7, height=7) { stopifnot(!missing(x)) stopifnot(!missing(filename)) pdf(filename, width=width, height=height) grid::grid.newpage() grid::grid.draw(x$gtable) dev.off() } pht<-pheatmap(gsampleDistMatrix, #plot matrix clustering_distance_rows=gsampleDists, #cluster rows clustering_distance_cols=gsampleDists, #cluster columns col=colors) #set colors save_pheatmap_pdf(pht, "A-Pver/output/rna-seq/sample_distance_heatmap.pdf") ``` Plot a PCA of samples by Treatment. ```{r} gPCAdata <- plotPCA(gvst, intgroup = c("Treatment"), returnData=TRUE, ntop=20326) #use ntop to specify all genes percentVar <- round(100*attr(gPCAdata, "percentVar")) #plot PCA of samples with all data allgenesfilt_PCA <- ggplot(gPCAdata, aes(PC1, PC2, color=Treatment)) + geom_point(size=3) + xlab(paste0("PC1: ",percentVar[1],"% variance")) + ylab(paste0("PC2: ",percentVar[2],"% variance")) + coord_fixed()+ theme_classic() + #Set background color theme(panel.border = element_blank(), # Set border #panel.grid.major = element_blank(), #Set major gridlines #panel.grid.minor = element_blank(), #Set minor gridlines axis.line = element_line(colour = "black"), #Set axes color plot.background=element_blank()) #Set the plot background allgenesfilt_PCA ggsave("A-Pver/output/rna-seq/treatment_pca.png", allgenesfilt_PCA, width=6, height=4) ``` Confirm that PCA viaualization in DESeq is the same. ```{r} plotPCA(gvst, intgroup = "Treatment") ``` Yes, these visualizations are the same as manual method above. # View Differential Expression Examine differential expressed genes. ## Run without shrinkage Run differential expression analysis between treatment and alpha set at 0.05. ```{r} degenes<-DESeq(gdds) degenes<-results(degenes, contrast=c("Treatment","Control","Enriched"), alpha=0.1) #order by p value degenes <- degenes[order(degenes$padj),] head(degenes) summary(degenes) sum(degenes$padj < 0.05, na.rm=TRUE) ``` There are 175 up regulated genes and 339 down regulated genes between our treatments. There are 0 outliers and 789 genes with low counts. There are 233 genes with an adjusted p-value of <0.05. Plot log fold change. ```{r} plotMA(degenes, ylim=c(-8,8), alpha = 0.05, colSig="red") ``` ```{r} plotCounts(gdds, gene=which.min(degenes$padj), intgroup="Treatment") ``` ## Next, run results with shrinkage ```{r} degenes_shrink<-DESeq(gdds) degenes_shrink<-lfcShrink(degenes_shrink, coef=2, type="normal", alpha=0.1) #order by p value degenes_shrink <- degenes_shrink[order(degenes_shrink$padj),] head(degenes_shrink) summary(degenes_shrink) sum(degenes_shrink$padj < 0.05, na.rm=TRUE) ``` Results are the same between methods. I was not able to set alpha in the shrinkage method. for now I will proceed without shrinkage and we can return to this step. ## Export data ```{r} write.csv(as.data.frame(degenes), file="A-Pver/output/rna-seq/treatment_deg_results.csv") ``` # Plot DEG heatmap Run differential expression test using a Wald model on DEGs. Extract DEGs that are padj<0.05 and log2fold change >1. ```{r, message = FALSE} degenes<-DESeq(gdds) DEG.results.all <- results(degenes) DEG.results.all<-as.data.frame(DEG.results.all) DEG.results.all <- as.data.frame(subset(DEG.results.all, padj<0.05)) DEG.results.all <- as.data.frame(subset(DEG.results.all, abs(log2FoldChange)>1)) dim(DEG.results.all) DEGlist <- gdds[rownames(DEG.results.all)] DEGvst <- varianceStabilizingTransformation(DEGlist) ``` Plot a heatmap of DEG expression. ```{r} df <- as.data.frame(colData(DEGvst)[,c("Treatment")]) names(df)<-"Treatment" rownames(df) <- colnames(DEGlist) df_DEGSeq2 <- as.data.frame(colData(DEGvst)[c("Treatment")]) #make dataframe for column naming and associated treatment pdf("A-Pver/output/rna-seq/DEG_pheatmap.pdf") pheatmap(assay(DEGvst), scale= "row", legend=TRUE, annotation_legend=TRUE, annotation_col=df_DEGSeq2, clustering_distance_rows="euclidean", clustering_method = "average", show_rownames =FALSE, show_colnames =TRUE, cluster_cols = TRUE) dev.off() ``` These results are quite different from Danielle's: https://github.com/hputnam/Becker_E5/blob/master/RAnalysis/Scripts/RNA-seq/Host/Host_Differential_Gene_Expression_Analysis.Rmd. We need to figure out why. This analysis doesn't show much difference in treatments, but Danielle's figures show more difference. # WGCNA analysis to treatment ## WGCNA analysis using Dynamic Tree Cut This code uses step by step network analysis and module detection based on scripts from https://horvath.genetics.ucla.edu/html/CoexpressionNetwork/Rpackages/WGCNA/Tutorials/index.html. Transpose the filtered gene count matrix so that the gene IDs are rows and the sample IDs are columns. ```{r} datExpr <- as.data.frame(t(assay(gvst))) #transpose to output to a new data frame with the column names as row names. And make all data numeric ``` Check for genes and samples with too many missing values with goodSamplesGenes. There shouldn't be any because we performed pre-filtering ```{r} gsg = goodSamplesGenes(datExpr, verbose = 3) gsg$allOK #Should return TRUE if not, the R chunk below will take care of flagged data ``` Remove flagged samples if the allOK is FALSE, not used here. ```{r} #ncol(datExpr) #number genes before #if (!gsg$allOK) #If the allOK is FALSE... #{ # Optionally, print the gene and sample names that are flagged: #if (sum(!gsg$goodGenes)>0) #printFlush(paste("Removing genes:", paste(names(datExpr)[!gsg$goodGenes], collapse = ", "))); #if (sum(!gsg$goodSamples)>0) #printFlush(paste("Removing samples:", paste(rownames(datExpr)[!gsg$goodSamples], collapse = ", "))); # Remove the offending genes and samples from the data: #datExpr = datExpr[gsg$goodSamples, gsg$goodGenes] #} #ncol(datExpr) #number genes after ``` Look for outliers by examining tree of samples ```{r} sampleTree = hclust(dist(datExpr), method = "average"); # Plot the sample tree: Open a graphic output window of size 12 by 9 inches # The user should change the dimensions if the window is too large or too small. pdf("A-Pver/output/rna-seq/wgcna_outliers.pdf") plot(sampleTree, main = "Sample clustering to detect outliers", sub="", xlab="", cex.lab = 1.5, cex.axis = 1.5, cex.main = 2) dev.off() ``` There don't look to be any outliers, so we will move on with business as usual. ## Network construction and consensus module detection ### Choosing a soft-thresholding power: Analysis of network topology β The soft thresholding power (β) is the number to which the co-expression similarity is raised to calculate adjacency. The function pickSoftThreshold performs a network topology analysis. The user chooses a set of candidate powers, however the default parameters are suitable values. ```{r, message=FALSE, warning=FALSE} allowWGCNAThreads() # # Choose a set of soft-thresholding powers powers <- c(seq(from = 1, to=19, by=2), c(21:30)) #Create a string of numbers from 1 through 10, and even numbers from 10 through 20 # # # Call the network topology analysis function sft <-pickSoftThreshold(datExpr, powerVector = powers, verbose = 5) ``` Plot the results. ```{r} sizeGrWindow(9, 5) par(mfrow = c(1,2)); cex1 = 0.9; # # # Scale-free topology fit index as a function of the soft-thresholding power plot(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2], xlab="Soft Threshold (power)",ylab="Scale Free Topology Model Fit,signed R^2",type="n", main = paste("Scale independence")); text(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2], labels=powers,cex=cex1,col="red"); # # # this line corresponds to using an R^2 cut-off abline(h=0.9,col="red") # # # Mean connectivity as a function of the soft-thresholding power plot(sft$fitIndices[,1], sft$fitIndices[,5], xlab="Soft Threshold (power)",ylab="Mean Connectivity", type="n", main = paste("Mean connectivity")) text(sft$fitIndices[,1], sft$fitIndices[,5], labels=powers, cex=cex1,col="red") ``` I used a scale-free topology fit index **R^2 of 0.9**. This lowest recommended R^2 by Langfelder and Horvath is 0.8. I chose 0.9 because we want to use the smallest soft thresholding power that maximizes with model fit. It appears that our **soft thresholding power is 5** because it is the lowest power above the R^2=0.9 threshold that maximizes with model fit. ### Co-expression adjacency and topological overlap matrix similarity Save Rdata necessary for analysis ```{r} save(datExpr, file = "A-Pver/output/rna-seq/datExpr.RData") ``` Co-expression similarity and adjacency, using the soft thresholding power 5 and translate the adjacency into topological overlap matrix to calculate the corresponding dissimilarity. I will use a **signed network**. ```{r, } # #Set up workspace #getwd() #Display the current working directory # #If necessary, change the path below to the directory where the data files are stored. "." means current directory. On Windows use a forward slash / instead of the usual \. #workingDir = "."; # setwd(WGCNA_dev); # library(WGCNA) #Load the WGCNA package options(stringsAsFactors = FALSE) #The following setting is important, do not omit. enableWGCNAThreads() #Allow multi-threading within WGCNA. # # #Load the data saved in the first part adjTOM <- load(file="A-Pver/output/rna-seq/datExpr.RData") adjTOM # # #Run analysis softPower=5 #Set softPower to 5 adjacency=adjacency(datExpr, power=softPower,type="signed") #Calculate adjacency TOM= TOMsimilarity(adjacency,TOMType = "signed") #Translate adjacency into topological overlap matrix #this step can take awhile dissTOM= 1-TOM #Calculate dissimilarity in TOM #save(adjacency, TOM, dissTOM, file = "Mcap2020/Output/TagSeq/adjTOM.RData") #Save #save(dissTOM, file = "Mcap2020/Output/TagSeq/dissTOM.RData") #Save ``` Load in dissTOM file obtained from previous R chunk. ```{r} #dissTOM_in <- load(file="Mcap2020/Output/TagSeq/dissTOM.RData") #dissTOM_in ``` ### Clustering using TOM Form distance matrix ```{r} geneTree=flashClust(as.dist(dissTOM), method="average") ``` We will now plot a dendrogram of genes. Each leaf corresponds to a gene, branches grouping together densely are interconnected, highly co-expressed genes. ```{r} pdf(file="A-Pver/output/rna-seq/dissTOMClustering.pdf", width=20, height=20) plot(geneTree, xlab="", sub="", main= "Gene Clustering on TOM-based dissimilarity", labels= FALSE,hang=0.04) dev.off() ``` ### Module identification using dynamicTreeCut Module identification is essentially cutting the branches off the tree in the dendrogram above. We like large modules, so we set the **minimum module size** relatively high, so we will set the minimum size at 30. ```{r} minModuleSize = 30 dynamicMods = cutreeDynamic(dendro = geneTree, distM = dissTOM, deepSplit = TRUE, pamRespectsDendro = FALSE, minClusterSize = minModuleSize) table<-as.data.frame(table(dynamicMods)) #list modules and respective sizes table(dynamicMods) save(dynamicMods, geneTree, file = "A-Pver/output/rna-seq/dyMod_geneTree.RData") #Save to load into RStudio ``` There are 59 modules! We will merge modules in the next few steps. Load modules calculated from the adjacency matrix. ```{r} dyMod_geneTree <- load(file = "A-Pver/output/rna-seq/dyMod_geneTree.RData") dyMod_geneTree ``` Plot the module assignment under the gene dendrogram ```{r} #dynamicColors = labels2colors(dynamicMods) # Convert numeric labels into colors dynamicColors=dynamicMods table(dynamicColors) pdf(file="A-Pver/output/rna-seq/dissTOMColorClustering.pdf", width=20, height=20) plotDendroAndColors(geneTree, dynamicColors, "Dynamic Tree Cut", dendroLabels = FALSE, hang = 0.03, addGuide = TRUE, guideHang = 0.05, main = "Gene dendrogram and module colors") dev.off() ``` ### Merge modules with similar expression profiles Plot module similarity based on eigengene value ```{r} #Calculate eigengenes MEList = moduleEigengenes(datExpr, colors = dynamicColors, softPower = 5) MEs = MEList$eigengenes #Calculate dissimilarity of module eigengenes MEDiss = 1-cor(MEs) #Cluster again and plot the results METree = flashClust(as.dist(MEDiss), method = "average") pdf(file="A-Pver/output/rna-seq/eigengeneClustering1.pdf", width = 20) plot(METree, main = "Clustering of module eigengenes", xlab = "", sub = "") dev.off() ``` **Merge modules with >85% eigengene similarity.** Most studies use somewhere between 80-90% similarity. I will use 85% similarity as my merging threshold. ```{r} MEDissThres= 0.15 #merge modules that are 85% similar pdf(file="A-Pver/output/rna-seq/eigengeneClustering2.pdf", width = 20) plot(METree, main = "Clustering of module eigengenes", xlab = "", sub = "") abline(h=MEDissThres, col="red") dev.off() merge= mergeCloseModules(datExpr, dynamicColors, cutHeight= MEDissThres, verbose =3) mergedColors= merge$colors mergedMEs= merge$newMEs pdf(file="A-Pver/output/rna-seq/mergedClusters.pdf", width=20, height=20) plotDendroAndColors(geneTree, cbind(dynamicColors, mergedColors), c("Dynamic Tree Cut", "Merged dynamic"), dendroLabels= FALSE, hang=0.03, addGuide= TRUE, guideHang=0.05) dev.off() ``` Save new modules ```{r} moduleLabels=mergedColors moduleColors = mergedColors # Rename to moduleColors #colorOrder = c("grey", standardColors(50)); # Construct numerical labels corresponding to the colors #moduleLabels = match(moduleColors, colorOrder)-1; MEs = mergedMEs; ncol(MEs) #How many modules do we have now? ``` We have 43 modules after merging with 85% similarity as compared to 59 before. Plot new tree with new modules. ```{r} #Calculate dissimilarity of module eigengenes MEDiss = 1-cor(MEs) #Cluster again and plot the results pdf(file="A-Pver/output/rna-seq/eigengeneClustering3.pdf") METree = flashClust(as.dist(MEDiss), method = "average") MEtreePlot = plot(METree, main = "Clustering of module eigengenes", xlab = "", sub = "") dev.off() ``` Display table of module gene counts. ```{r} table(mergedColors) table<-as.data.frame(table(mergedColors)) write_csv(table, "A-Pver/output/rna-seq/genes_per_module.csv") ``` The number of genes per module range from 38 to 4168. ## Relating modules to treatment ### Quantifying module–trait associations Prepare trait data. Data has to be numeric, so I will substitute treatment for numeric values. The "trait" we are considering here is treatment. Make a dataframe that has a column for each treatment and a row for samples. Populate a 1 for samples in the enriched treatment and a 0 for samples in the control treatment. This process changes treatment from a categorical variable into a binary variable. This will allow for correlations between mean eigengenes and treatment. We are doing treatment for now, but will do physiological variables next. ```{r} metadata_ordered$num <- c("1") allTraits <- as.data.frame(pivot_wider(metadata_ordered, names_from = Treatment, values_from = num, id_cols = FragmentID)) allTraits[is.na(allTraits)] <- c("0") rownames(allTraits) <- allTraits$FragmentID datTraits <- allTraits[,c(-1)] datTraits ``` Define numbers of genes and samples and print. ```{r} nGenes = ncol(datExpr) nSamples = nrow(datExpr) nGenes nSamples ``` We have 20,326 genes and 32 samples. Generate labels for module eigengenes as numbers. ```{r} MEs0 = moduleEigengenes(datExpr, moduleLabels, softPower=5)$eigengenes MEs = orderMEs(MEs0) names(MEs) ``` Our module names are: [1] "ME33" "ME11" "ME3" "ME14" "ME17" "ME35" "ME26" "ME46" "ME55" "ME53" "ME57" "ME31" [13] "ME32" "ME49" "ME56" "ME9" "ME41" "ME18" "ME34" "ME5" "ME25" "ME39" "ME21" "ME10" [25] "ME20" "ME2" "ME50" "ME42" "ME22" "ME23" "ME8" "ME52" "ME58" "ME1" "ME28" "ME54" [37] "ME15" "ME12" "ME27" "ME47" "ME7" "ME6" "ME43" Correlations of treatment with eigengenes ```{r} moduleTraitCor = cor(MEs, datTraits, use = "p"); moduleTraitPvalue = corPvalueStudent(moduleTraitCor, nSamples); Colors=sub("ME","", names(MEs)) moduleTraitTree = hclust(dist(t(moduleTraitCor)), method = "average") #pdf(file="A-Pver/output/rna-seq/ModuleTraitClusterTree.pdf") #plot(moduleTraitTree) #dev.off() ``` Correlations of genes with eigengenes. Calculate correlations between ME's and treatment ```{r} moduleGeneCor=cor(MEs,datExpr) moduleGenePvalue = corPvalueStudent(moduleGeneCor, nSamples); ``` Calculate kME values (module membership). ```{r} datKME = signedKME(datExpr, MEs, outputColumnName = "kME") head(datKME) ``` ### Plot module-treatment associations Generate a complex heatmap of module-treatment relationships. ```{r} #bold sig p-values #dendrogram with WGCNA MEtree cut-off #colored y-axis #Create list of pvalues for eigengene correlation with specific life stages heatmappval <- signif(moduleTraitPvalue, 1) #Make list of heatmap row colors htmap.colors <- names(MEs) htmap.colors <- gsub("ME", "", htmap.colors) treatment_order<-c("Control", "Enriched") library(dendsort) row_dend = dendsort(hclust(dist(moduleTraitCor))) col_dend = dendsort(hclust(dist(t(moduleTraitCor)))) pdf(file = "A-Pver/output/rna-seq/Module-trait-relationship-heatmap.pdf", height = 16, width = 8) Heatmap(moduleTraitCor, name = "Eigengene", row_title = "Gene Module", column_title = "Module-Lifestage Eigengene Correlation", col = blueWhiteRed(50), row_names_side = "left", #row_dend_side = "left", width = unit(5, "in"), height = unit(12, "in"), #column_dend_reorder = TRUE, #cluster_columns = col_dend, row_dend_reorder = TRUE, #column_split = 2, row_split=3, #column_dend_height = unit(.5, "in"), column_order = treatment_order, cluster_rows = row_dend, row_gap = unit(2.5, "mm"), border = TRUE, cell_fun = function(j, i, x, y, w, h, col) { if(heatmappval[i, j] < 0.05) { grid.text(sprintf("%s", heatmappval[i, j]), x, y, gp = gpar(fontsize = 10, fontface = "bold")) } else { grid.text(sprintf("%s", heatmappval[i, j]), x, y, gp = gpar(fontsize = 10, fontface = "plain")) }}, column_names_gp = gpar(fontsize = 12, border=FALSE), column_names_rot = 35, row_names_gp = gpar(fontsize = 12, alpha = 0.75, border = FALSE)) #draw(ht) dev.off() ``` There are two clusters of modules that are correlated with the enriched treatment and one cluster of modules associated with the control treatment. ## Plot mean eigengene over treatments View module eigengene data and make dataframe for Strader plots. ```{r} head(MEs) names(MEs) Strader_MEs <- MEs Strader_MEs$Treatment <- metadata_ordered$Treatment Strader_MEs$FragmentID <- rownames(Strader_MEs) head(Strader_MEs) Strader_MEs<-Strader_MEs%>% droplevels() #drop unused level dim(Strader_MEs) head(Strader_MEs) ``` Plot mean module eigengene for each module. ```{r} #convert wide format to long format for plotting plot_MEs<-Strader_MEs%>% gather(., key="Module", value="Mean", ME33:ME43) write_csv(plot_MEs, "A-Pver/output/rna-seq/WGCNA_eigengene_expression.csv") plot_MEs<- read_csv("A-Pver/output/rna-seq/WGCNA_eigengene_expression.csv") #dev.off() treatment_order = c("Control", "Enriched") #Set time_point order list_groups<-c("Control", "Enriched") levels(as.factor(plot_MEs$Treatment)) plot_MEs$Treatment<-factor(plot_MEs$Treatment, levels=list_groups) expression_plots<-plot_MEs%>% group_by(Module, Treatment) %>% ggplot(aes(x=Treatment, y=Mean, group=Treatment, colour=Treatment)) + facet_wrap(~ Module)+ geom_jitter(alpha = 0.5) + geom_boxplot(alpha=0) + scale_x_discrete(name="", limits=treatment_order) + #ylim(-0.5,1) + ylab("Mean Module Eigenegene") + geom_hline(yintercept = 0, linetype="dashed", color = "grey")+ theme_bw() + theme(axis.text.x=element_text(angle = 45, hjust=1, size = 12), #set x-axis label size axis.title.x=element_text(size = 14), #set x-axis title size axis.ticks.x=element_blank(), #No x-label ticks #axis.title.y=element_blank(), #No y-axis title axis.text.y=element_text(size = 14), #set y-axis label size, panel.border = element_rect(color = "black", fill = NA, size = 1), #set border panel.grid.major = element_blank(), #Set major gridlines panel.grid.minor = element_blank(), #Set minor gridlines axis.line = element_line(colour = "black"), #Set axes color plot.background=element_blank(), plot.title = element_text(size=22)); expression_plots ggsave(expression_plots, file="A-Pver/output/rna-seq/expression_eigengene.jpeg", height=16, width=10) ``` # WGCNA analysis to physiological variables Download data from Danielle's repository for this project. Thermal performance curve metrics: https://raw.githubusercontent.com/daniellembecker/Chronic_low_nutrient_enrichment_benefits_coral_thermal_performance_fore_reef_habitat/master/Thermal_Performance/Data/Topt_data.csv Photosynthesis curve data: https://raw.githubusercontent.com/daniellembecker/Chronic_low_nutrient_enrichment_benefits_coral_thermal_performance_fore_reef_habitat/master/Thermal_Performance/Data/Pmax_data.csv Chlorophyll and cell density: https://raw.githubusercontent.com/daniellembecker/Chronic_low_nutrient_enrichment_benefits_coral_thermal_performance_fore_reef_habitat/master/Endosymbiont_Coral_Response/Data/chl_zoox_sheet.csv Ash free dry weight: https://raw.githubusercontent.com/daniellembecker/Chronic_low_nutrient_enrichment_benefits_coral_thermal_performance_fore_reef_habitat/master/Endosymbiont_Coral_Response/Data/AFDW.csv ## Load data and prepare ```{r} pmax<-read.csv("https://raw.githubusercontent.com/daniellembecker/Chronic_low_nutrient_enrichment_benefits_coral_thermal_performance_fore_reef_habitat/master/Thermal_Performance/Data/Pmax_data.csv", sep=",", header=TRUE) ``` Prepare data to merge with above wgcna data frames. ```{r} pmax_df<-pmax%>% select(fragment.ID, Topt, rate.type)%>% mutate(fragment.ID=str_remove(fragment.ID, "_.*"))%>% pivot_wider(names_from="rate.type", values_from="Topt")%>% rename(Topt_R=`Dark Respiration`, Topt_P=`Gross Photosynthesis`) pmax_df$fragment.ID<-gsub("([[:alpha:]])([[:digit:]])", "\\1_\\2", pmax_df$fragment.ID) #insert underscore betweeen letter and first number in fragment ID ``` ```{r} str(datTraits) datTraits$Topt_R<-pmax_df$Topt_R[match(rownames(datTraits), pmax_df$fragment.ID)] datTraits$Topt_P<-pmax_df$Topt_P[match(rownames(datTraits), pmax_df$fragment.ID)] str(datTraits) ``` ## Run correlation WGCNA Correlations of treatment with eigengenes ```{r} moduleTraitCor = cor(MEs, datTraits, use = "p"); moduleTraitPvalue = corPvalueStudent(moduleTraitCor, nSamples); Colors=sub("ME","", names(MEs)) moduleTraitTree = hclust(dist(t(moduleTraitCor)), method = "average") #pdf(file="A-Pver/output/rna-seq/ModuleTraitClusterTree.pdf") #plot(moduleTraitTree) #dev.off() ``` Correlations of genes with eigengenes. Calculate correlations between ME's and treatment ```{r} moduleGeneCor=cor(MEs,datExpr) moduleGenePvalue = corPvalueStudent(moduleGeneCor, nSamples); ``` Calculate kME values (module membership). ```{r} datKME = signedKME(datExpr, MEs, outputColumnName = "kME") head(datKME) ``` ### Plot module-treatment associations Generate a complex heatmap of module-treatment relationships. ```{r} #bold sig p-values #dendrogram with WGCNA MEtree cut-off #colored y-axis #Create list of pvalues for eigengene correlation with specific life stages heatmappval <- signif(moduleTraitPvalue, 1) #Make list of heatmap row colors htmap.colors <- names(MEs) htmap.colors <- gsub("ME", "", htmap.colors) treatment_order<-c("Control", "Enriched", "Topt_R", "Topt_P") library(dendsort) row_dend = dendsort(hclust(dist(moduleTraitCor))) col_dend = dendsort(hclust(dist(t(moduleTraitCor)))) pdf(file = "A-Pver/output/rna-seq/Module-trait-relationship-heatmap.pdf", height = 16, width = 8) Heatmap(moduleTraitCor, name = "Eigengene", row_title = "Gene Module", column_title = "Module-Lifestage Eigengene Correlation", col = blueWhiteRed(50), row_names_side = "left", #row_dend_side = "left", width = unit(5, "in"), height = unit(12, "in"), #column_dend_reorder = TRUE, #cluster_columns = col_dend, row_dend_reorder = TRUE, #column_split = 2, row_split=8, #column_dend_height = unit(.5, "in"), column_order = treatment_order, cluster_rows = row_dend, row_gap = unit(2.5, "mm"), border = TRUE, cell_fun = function(j, i, x, y, w, h, col) { if(heatmappval[i, j] < 0.05) { grid.text(sprintf("%s", heatmappval[i, j]), x, y, gp = gpar(fontsize = 10, fontface = "bold")) } else { grid.text(sprintf("%s", heatmappval[i, j]), x, y, gp = gpar(fontsize = 10, fontface = "plain")) }}, column_names_gp = gpar(fontsize = 12, border=FALSE), column_names_rot = 35, row_names_gp = gpar(fontsize = 12, alpha = 0.75, border = FALSE)) #draw(ht) dev.off() ``` ## Plot mean eigengene over variables Plot these as correlations for each modle across quantitative variables