---
title: "qPCR data analysis"
output: html_document
editor_options: 
  chunk_output_type: console
---

## load libraries
```{r}
library(readxl)
library(tidyr)
library(dplyr)
library(ggplot2)
library(ggpubr)
library(plotrix)
library(mgcv)
library(ggsci)
library(gplots)


```


## read in data

```{r}
#read in roberto's gene_count_matrix.csv
rob_counts <- read.csv("gene_count_matrix (1).csv")

#read in Emma's gene counts matrix
nfcore_counts <- read_xlsx("salmon_merged_gene_counts.xlsx")

#read in meta data
meta_data <- read.csv("SraRunTable.csv")

#read in roberto's data that has been subsetted for genes matched to LOC IDs using the genome compare files (see jupyter notebook)
sub_rob_counts <- read.csv("gene_count_matrix_roberto_CGI2LOC_prev.csv", header =F)
#add header robertos subsetted gene counts matrix
colnames(sub_rob_counts) <- c("gene_id", "gene_name", colnames(rob_counts[,2:ncol(rob_counts)]))

#read in Emma's data that has been subsetted for genes that overlap with roberto's subsetted genes
sub_nfcore_counts <- nfcore_counts[which(nfcore_counts$gene_id %in% sub_rob_counts$gene_id),]

#read in genes ID'd as high exp genes in heat resistant oysteres from Roberto's paper (copied from supp table 6)
hi_genes <- read_xlsx("heat_tol_high_exp_genes.xlsx")

```


## Check subsetted data
```{r}
#count rows in Emma's subsetted data
nrow(sub_nfcore_counts)
# 1901

nrow(sub_rob_counts)
#1916

#There are more genes in Roberto's subsetted data than in Emma's meaning some LOC ids are listed multiple times
#further subset Roberto's data so that it only contains the unique IDs that are in Emma's subsetted data
sub_rob_counts <- sub_rob_counts[which(sub_rob_counts$gene_id %in% nfcore_counts$gene_id),]

nrow(sub_rob_counts)
#1913

#There are still duplicate LOC ids remaining in the list because there are still more genes in Roberto's data
#make vector of duplicated LOC ids in Roberto's data
dups <- sub_rob_counts[duplicated(sub_rob_counts$gene_id),]$gene_id

#exclude LOC IDs that have multiple CGIs associated with them. (These are genes that were previously annotated as multiple genes and are now merged into one)
sub_nfcore_counts <- sub_nfcore_counts %>% filter(!(gene_id %in% dups))
sub_rob_counts <- sub_rob_counts %>% filter(!(gene_id %in% dups))

nrow(sub_nfcore_counts)
#1890
nrow(sub_rob_counts)
#1890

#now there are the same number of rows and genes!
```


## format data
```{r}
#convert matrix to long format (all sample columns condense into one column with counts in another column)
rob_counts_l <- rob_counts %>% pivot_longer(2:ncol(rob_counts), names_to = "Library.Name", values_to = "counts")
#convert to long format
nfcore_counts_l <- nfcore_counts %>% pivot_longer(3:ncol(nfcore_counts), names_to = "Experiment", values_to = "counts")

#merge with meta data
rob_counts_l <- merge(meta_data[,c("Experiment", "Library.Name", "family", "timepoint..day.","trait")], rob_counts_l, by = "Library.Name")
#merge with meta data
nfcore_counts_l <- merge(meta_data[,c("Experiment", "Library.Name", "family", "timepoint..day.","trait")],nfcore_counts_l, by = "Experiment")


#convert subsetted data to long format and merge with meta data
sub_rob_counts_l <- sub_rob_counts %>% pivot_longer(3:ncol(sub_rob_counts), names_to = "Library.Name", values_to = "counts") %>% left_join(meta_data[,c("Experiment", "Library.Name", "family", "timepoint..day.","trait")], by = "Library.Name")

sub_nfcore_counts_l <- sub_nfcore_counts %>% pivot_longer(3:ncol(sub_nfcore_counts), names_to = "Experiment", values_to = "counts") %>% left_join(meta_data[,c("Experiment", "Library.Name", "family", "timepoint..day.","trait")], by = "Experiment")

#add column to denote method
rob_counts_l$method <- "old"
nfcore_counts_l$method <- "nf-core"
sub_nfcore_counts_l$method <- "nf-core"
sub_rob_counts_l$method <- "old"

#remove gene_name column from Emma's data so rows from Roberto's data and Emma's data can be combined
nfcore_counts_l$gene_name <- NULL

#combine roberto's and emma's data into one df
all_data <- rbind(rob_counts_l,nfcore_counts_l)
all_sub_data <- rbind(sub_rob_counts_l, sub_nfcore_counts_l)


#create a vector of samples to exclude from data because Roberto excluded these from his comparisons
excluded <- c("Os22", "Os23", "Os24", "Os25", "Os26","Os27", "Os4", "Os5", "Os6", "Os7", "Os8", "Os9")

#exclude samples Roberto excluded from analysis, remove gene_name column and spread methods column out so that it can be plotted as as a scatter plot
all_sub_data <- all_sub_data %>% filter(!(Library.Name %in% excluded))%>% dplyr::select(-gene_name) %>% pivot_wider(names_from = method, values_from = counts)

```


## plot histograms for each library
```{r}
jpeg("histograms.jpg", width = 7, height = 5, units = "in", res = 300)
all_data %>% filter(!(Library.Name %in% excluded))%>% ggplot(aes(log10(counts), fill = method)) + geom_histogram(alpha = 0.5) + labs(x = "counts (log10)", y = "# genes") + theme_bw() + facet_wrap(~Library.Name)
dev.off()

jpeg("density_plots.jpg", width = 7, height = 5, units = "in", res = 300)
all_data %>% filter(!(Library.Name %in% excluded))%>% ggplot(aes(log10(counts), color = method,fill = method, alpha = 0.2)) + geom_density(alpha = 0.2) + labs(x = "counts (log10)", y = "% genes") + theme_bw() + facet_wrap(~Library.Name)
dev.off()

```

## make scatter plot of each library
```{r}
#create scatter plots of nf-core counts x roberto's counts for each sample
jpeg("corplot_2K_eachSample.jpg", width = 10, height = 7, units = "in", res = 300)
 all_sub_data%>% ggplot(aes(log10(old+0.5),log10(`nf-core`+0.5))) + geom_point() + labs(x = "Roberto counts (log10(+0.5))", y = "nf-core counts (log10(+0.5))") + theme_bw() + facet_wrap(~Library.Name) + ggtitle("1,890 overlapping genes comparing to prev.txt doc")
dev.off()

#create on scatter plot of nf-core counts x roberto's counts for all samples
jpeg("corplot_2K.jpg", width = 5, height = 4, units = "in", res = 300)
all_sub_data%>% ggplot(aes(log10(old+0.5),log10(`nf-core`+0.5))) + geom_point() + labs(x = "Roberto counts (log10(+0.5))", y = "nf-core counts (log10(+0.5))") + theme_bw() + ggtitle("1,890 overlapping genes comparing to prev.txt doc")
dev.off()


```


make bins for different zero and non-zero categories
```{r}
all_sub_data$zero_cat <- NA
all_sub_data$zero_cat[which(all_sub_data$old==0 & all_sub_data$`nf-core`==0)] <- "both_0"
all_sub_data$zero_cat[which(all_sub_data$old==0 & all_sub_data$`nf-core`!=0)] <- "emma_only"
all_sub_data$zero_cat[which(all_sub_data$old!=0 & all_sub_data$`nf-core`==0)] <- "rob_only"
all_sub_data$zero_cat[which(all_sub_data$old!=0 & all_sub_data$`nf-core`!=0)] <- "both_non0"


jpeg("hist_zeroVSnonzeros.jpg", width = 8, height = 3, units = "in", res = 300)
all_sub_data %>% filter(zero_cat != "both0" & zero_cat !="both_non0") %>% pivot_longer(7:8, names_to = "method", values_to = "count")%>% filter(count !=0) %>%ggplot(aes(x = log10(count), fill = zero_cat)) + geom_histogram(position = position_dodge()) + theme_bw() + labs(x = "counts(log10)", y= "# genes", fill = "method") + ggtitle("Genes with 0 counts in one method and non-zero in the other\n (9,065 non-zero in Emma's and 796 non-zero in Roberto's only)")
dev.off()

jpeg("scatter_nonzeros.jpg", width = 5, height = 4, units = "in", res = 300)
all_sub_data%>% filter(zero_cat =="both_non0") %>% ggplot(aes(log10(old+0.5),log10(`nf-core`+0.5))) + geom_point() + labs(x = "Roberto counts (log10(+0.5))", y = "nf-core counts (log10(+0.5))") + theme_bw() + ggtitle("3,959 genes with non-zero counts in both methods ")
dev.off()

jpeg("hexbin_nonzeros.jpg", width = 5, height = 4, units = "in", res = 300)
all_sub_data%>% filter(zero_cat =="both_non0") %>% ggplot(aes(old,`nf-core`)) + geom_hex() + labs(x = "Roberto counts", y = "nf-core counts") + theme_bw() + ggtitle("3,959 genes with non-zero counts in both methods ")
dev.off()


```


## plot distribution of counts for genes Roberto id'd as highly expressed in resistant oysters
```{r}
jpeg("heat_resistant_high_exp_genes_boxplots.jpg", width = 13, height = 10, units = "in", res = 300)
nfcore_counts_l %>% filter(gene_id %in% hi_genes$gene_id) %>% ggplot(aes(x = trait,y = log10(counts+0.5), group = interaction(gene_id,trait),fill = trait))+ geom_violin(alpha = 0.5)+ geom_boxplot(width = 0.2, alpha = 0.2, outlier.shape = NA)+geom_jitter(alpha = 0.5,size = 1.2, width = 0.1) + theme_bw()+ theme(axis.text.x = element_text(angle = 45, hjust = 1))+ labs(y = "log10(counts + 0.5)", fill = "sample")+ facet_wrap(~gene_id, scale = "free") +ggtitle("genes Roberto id'd as high exp. in resistant samples")+  theme(axis.title.x=element_blank(),axis.text.x=element_blank(),axis.ticks.x=element_blank())
dev.off()

```


#make heatmap of all genes * this is incomplete

```{r}
Rmatrix <- as.matrix(rob_counts[,!(colnames(rob_counts)%in%excluded)] %>% dplyr::select(-gene_id) )

Nmatrix <- as.matrix(nfcore_counts[,!(colnames(nfcore_counts)%in%excluded)]  %>% dplyr::select(-gene_id) %>% dplyr::select(-gene_name))


Rmatrix <- Rmatrix[rowSums(Rmatrix[])>0,]
Nmatrix <- Nmatrix[rowSums(Nmatrix[])>0,]
Rmatrix[Rmatrix==0] <- 0.5
Nmatrix[Nmatrix==0] <- 0.5

heatmap.2(log10(Rmatrix), dendrogram = "both", tracecol = NA)
  
```