---
title: "40-gene-methylation"
output: html_document
---

Lets see if we can get gene methylation with a 10x bed file.. and gene gff via intersect bed...

## NOTE: Due to the large file sizes generated in this Rmd file, numerous intermediate files cannot
## be uploaded to GitHub. Thus, each user will have to run this Rmd file themselves.

```
#Use intersectBed to find where loci and genes intersect, allowing loci to be mapped to annotated genes
#wb: Print all lines in the second file
#a: file that ends in posOnly
#b: annotated gene list
#Save output in a new file that has the same base name and ends in -Annotated.txt
for f in RAnalysis/data/*F_R1_val_1_10x.tab
do
  intersectBed \
  -wb \
  -a ${f} \
  -b RAnalysis/data/C_virginica-3.0_Gnomon_genes.bed \
  > ${f}_gene
done
```

# Load libraries
```{r load-libraries}
library("tidyverse")
library("Rfast") # For row-wise coeeficient of variation calcs
library("RColorBrewer")
library("ggExtra")
```
## Set variables

```{r set-variables}
# Vectors for subsetting samples by different groups
all <- c("S13M", "S16F", "S19F", "S39F", "S44F", "S52F", "S53F", "S54F", "S64M", "S6M", "S76F", "S7M", "S12M", "S22F", "S23M", "S29F", "S31M", "S35F", "S36F", "S3F", "S41F", "S48M", "S50F", "S59M", "S77F", "S9M")
controls <- c("S13M", "S16F", "S19F", "S39F", "S44F", "S52F", "S53F", "S54F", "S64M", "S6M", "S76F", "S7M")
exposed <- c("S12M", "S22F", "S23M", "S29F", "S31M", "S35F", "S36F", "S3F", "S41F", "S48M", "S50F", "S59M", "S77F", "S9M")
controls.males <- c("S13M", "S64M", "S6M", "S7M")
exposed.males <- c("S12M", "S23M", "S31M", "S48M", "S59M", "S9M")
controls.females <- c("S16F", "S19F", "S39F", "S44F", "S52F", "S53F", "S54F", "S76F")
exposed.females <- c("S22F", "S29F", "S35F", "S36F", "S3F", "S41F", "S50F", "S77F")
females <- c(controls.females, exposed.females)
males <- c(controls.males, exposed.males)

# Declare list for storing methylation dataframes
mean.gene.methylation.CoV.list <- list()

# Vector of treatment comparisons
comparisons <- c("all",
                 "females_v_males",
                 "controls_v_exposed",
                 "controls.females_v_exposed.females",
                 "controls.males_v_exposed.males",
                 "controls.females_v_controls.males",
                 "exposed.females_v_exposed.males"
                 )

```

# Download bedgraph files

```{bash download-bedgraphs}
mkdir --parents ../data/big

cd ../data/big

wget -r \
--no-check-certificate \
--continue \
--quiet \
--no-directories --no-parent \
-P . \
-A R1_val_1_10x.bedgraph \
https://gannet.fish.washington.edu/seashell/bu-mox/scrubbed/120321-cvBS/

```


# Examine data files
```{bash glance-at-bedgraph-format}
head ../data/big/3F_R1_val_1_10x.bedgraph
```
```{bash glance-at-genes-BED}
head ../genome-features/C_virginica-3.0_Gnomon_genes.bed
```

# Test intersectBed command
```{bash}
NAME=5F

/home/shared/bedtools2/bin/intersectBed \
-wb \
-a ../data/big/3F_R1_val_1_10x.bedgraph \
-b ../genome-features/C_virginica-3.0_Gnomon_genes.bed \
| awk -v name=$NAME -v OFS="\t" '{ print $0, name}' \
| head
```


# Run intersectBed on all files
```{bash run-intersectBed-on-all-files}
 cd ../data/big/
 
 FILES=$(ls *bedgraph)
 
 cd -
 
 for file in ${FILES}
 do
    NAME=$(echo ${file} | awk -F "_" '{print $1}')
    echo ${NAME}
   /home/shared/bedtools2/bin/intersectBed \
   -wb \
   -a ../data/big/${NAME}_R1_val_1_10x.bedgraph \
   -b ../genome-features/C_virginica-3.0_Gnomon_genes.bed \
   | awk -v name=$NAME -v OFS="\t" '{ print $0, name}' \
   > ../output/40-gene-methylation/${NAME}_mGene.out
 done  


```

## Glance at example output
```{bash glance-at-example-output}
head ../output/40-gene-methylation/36F_mGene.out
```

# Concatenate all gene methylation files
```{bash concatenate-all-gene-methylation-files}
cat ../output/40-gene-methylation/*_mGene.out > ../output/40-gene-methylation/meth_all-samples.out
```


# Read in gene methylation file
```{r}
meth_all <- read.delim("../output/40-gene-methylation/meth_all-samples.out", header = FALSE)
```

## View gene methylation dataframe
```{r view-gene-methylation-dataframe}
meth_all
```
# Plot methylation distribution
```{r plot-methylation-disctribution}
ggplot() +
  geom_histogram(data = meth_all, aes(x = V4), fill = "blue", alpha = 0.2) 
```

# Calculate mean methylation per gene per sample
```{r calculate-mean-methylation-per-gene-per-sample}
gm <- meth_all %>%
   mutate(art = paste(V8, V11, sep = '_')) %>%
   group_by(art) %>%
   summarize(avg_meth = mean(V4))

```

## View mean methylation per gene per sample
```{r view-mean-methylation-per-gene-per-sample}
gm
```

```{r}
mc <- inner_join(gm, ic, by = "art") %>%
   separate(art, into = c("gene", "sample"), sep = "_") %>%
   separate(sample, into = c("number", "sex"), sep = -1)

```

```{r, fig.width=20,fig.height=10}

ggplot(mc, aes(x = avg_meth, y = isoform_count, color = sex)) +
  geom_point(alpha = 0.6, size = 1) +
  facet_wrap(~number) +
  geom_hline(yintercept=10)
```
```{r, fig.width=20,fig.height=10}

ggplot(mc, aes(x = avg_meth, y = isoform_count, color = sex)) +
  geom_point(alpha = 0.6, size = 1) +
  facet_wrap(~number) +
  stat_smooth(method = glm, method.args = list(family = binomial))
```


```{r}

p <- ggplot(mc, aes(x = avg_meth, y = isoform_count)) +
  geom_point(alpha = 0.2, size = 1) +
  facet_wrap(~sex) 
ggMarginal(p, type = "histogram")
```


```{r}
ggplot(mc, aes(x = avg_meth)) +
  geom_histogram() +
  facet_wrap(~sex)
```


# Create gene methylation file where libary is row

```{r write-mean-gene-methylation-to-file}
gm %>%
  separate(art, into = c("gene", "sample_id"), sep = '_') %>%
  spread(value = avg_meth, key = gene) %>%
  write.csv(., "../output/40-gene-methylation/40-gene-methylation.csv")
```


# Reorient gene methylation data
```{r reorient-mean-methylation-data}
methylation <- gm %>%
  separate(art, into = c("gene", "sample_id"), sep = '_') %>%
  spread(value = avg_meth, key = gene)

head(methylation)
```

## Save methylation object

Creating this object is useful for skipping time-consuming steps above.
when needing to use this notebook in the future.

Also avoids the need for high-memory computer that the previous steps above require.
```{r save-methylation-object}
save(methylation, file = "../output/40-gene-methylation/methylation.RData")
```

## Prepare methylation file for joining with BED
```{r prep-methylation-file-for-joining-with-BED}

# Need to transpose to get gene names as rows
# For joining with BED
methylation.transposed <- as.data.frame(t(methylation))

# Convert rownames to a column of data
methylation.transposed.rownames <- rownames_to_column(methylation.transposed)

# # Convert first row to header
names(methylation.transposed.rownames) <- methylation.transposed.rownames[1, ]
methylation.transposed.rownames <- methylation.transposed.rownames[-1, ]

# Replace "." in gene names with "-"
# Will match gene names in BED file
methylation.transposed.rownames$sample_id <- str_replace_all(methylation.transposed.rownames$sample_id, "\\.", "-")

# Rename sample name columns to match sample grouping vectors
# Capture column names
# Took lazy approach and applied to all columns instead of subsetting desired columns
original_cols <- colnames(methylation.transposed.rownames)

# Append "S" to beginning of all samples names
colnames(methylation.transposed.rownames) <- paste("S", original_cols, sep = "")

# Replace "Ssample_id" with "name" to match BED file
names(methylation.transposed.rownames)[names(methylation.transposed.rownames) == "Ssample_id"] <- "name"

# Capture gene names
gene.names <- pull(methylation.transposed.rownames, name)

head(methylation.transposed.rownames)


```


## Convert to matrix
```{r convert-to-matrix}
# Convert first column to rownames for rowcvs() compatibility - requires matrix as input
methylation.transposed.rownames.rownames <- methylation.transposed.rownames[,-1]

# Assign gene names to rownames - needed for rowcvs() compatibility - requires matrix as input
rownames(methylation.transposed.rownames.rownames) <- methylation.transposed.rownames[,1]

# Convert from "characters" to numbers
methylation.transposed.rownames.rownames.converted <- sapply(methylation.transposed.rownames.rownames, as.numeric)

# Coerce into matrix for use with `rowcvs()` function
methylation.transposed.matrix <- data.matrix(methylation.transposed.rownames.rownames.converted, rownames.force = TRUE)

head(methylation.transposed.matrix)

```

## Calculate mean gene methylation coefficients of variation (CoV) for all samples
```{r calculate-mean-gene-methylation-coefficients-of-variation-for-all-samples}


for (comparison in comparisons) {
  
  # Parse comparisons to use subsequent strings for
  # column labels and file naming
  
  ## Get the left side of the comparison
  first_comparison <- sapply(
    strsplit(comparison, "_"),
    head,1
    )
  
  ## Get the right side of the comparison
  second_comparison <- sapply(
    strsplit(comparison,"_"), 
    tail,1
    )
  
  # Set first CoV treatment column name
  first_treatment_col_CoV <- paste(first_comparison,
                         "mean.methylation.CoV",
                         sep = ".")
  
  # Set second CoV treatment column name
  second_treatment_col_CoV <- paste(second_comparison,
                         "mean.methylation.CoV",
                         sep = ".")
  
  # Set first mean methylation treatment column name
  first_treatment_col_mean_methylation <- paste(first_comparison,
                         "mean.methylation",
                         sep = ".")
  
  # Set second mean methylation treatment column name
  second_treatment_col_mean_methylation <- paste(second_comparison,
                         "mean.methylation",
                         sep = ".")
  
  # Calculate coefficients of variation
  if (comparison != "all") {
    
    # Calculate coefficients of variation using rowcvs(), convert to data frame and add to list
    # Uses get() to convert string to name of the vector stored in "first_comparison"
    mean.gene.methylation.CoV.list[[comparison]][first_treatment_col_CoV] <-
      as.data.frame(
        methylation.transposed.matrix %>%
          subset(., TRUE, get(first_comparison)) %>% 
        rowcvs(., unbiased = TRUE)
        )
    
    # Calculate coefficients of variation using rowcvs(), convert to data frame and add to list
    # Uses get() to convert string to name of the vector stored in "second_comparison"
    mean.gene.methylation.CoV.list[[comparison]][second_treatment_col_CoV] <-
      as.data.frame(
        methylation.transposed.matrix %>%
          subset(., TRUE, get(second_comparison)) %>%
        rowcvs(., unbiased = TRUE)
      )
    
    # Calculate mean gene methylation using rowMeans(), convert to data frame and add to list.
    # Uses get() to convert string to name of the vector stored in "first_comparison"
    mean.gene.methylation.CoV.list[[comparison]][first_treatment_col_mean_methylation] <-
      as.data.frame(
        methylation.transposed.matrix %>%
          subset(., TRUE, get(first_comparison)) %>% 
        rowMeans(.,)
        )
    
    # Calculate mean gene methylation using rowMeans(), convert to data frame and add to list.
    # Uses get() to convert string to name of the vector stored in "second_comparison"  
    mean.gene.methylation.CoV.list[[comparison]][second_treatment_col_mean_methylation] <-
      as.data.frame(
        methylation.transposed.matrix %>%
          subset(., TRUE, get(second_comparison)) %>% 
        rowMeans(.,)
        )
    
    # Convert to data frames
    # Commands above create lists of lists
    mean.gene.methylation.CoV.list <- lapply(mean.gene.methylation.CoV.list, as.data.frame)
    
    # Calculate mean gene methylation of the comparison using rowMeans(), convert to matrix and add to list.   
    mean.gene.methylation.CoV.list[[comparison]]$mean.methylation <- 
  rowMeans(as.matrix(mean.gene.methylation.CoV.list[[comparison]][c(first_treatment_col_mean_methylation, second_treatment_col_mean_methylation)]))
    
    
    # Calculate absolute value of difference in methylation between the first and second components
    # of the comparison and add to list.
    mean.gene.methylation.CoV.list[[comparison]]$abs.delta.CoV <- 
      abs(mean.gene.methylation.CoV.list[[comparison]][,first_treatment_col_CoV] - mean.gene.methylation.CoV.list[[comparison]][,second_treatment_col_CoV])

  } else { mean.gene.methylation.CoV.list[[comparison]] <-
  as.data.frame(methylation.transposed.matrix %>%
    rowcvs(., unbiased = TRUE)) %>%
  rename("CoV" = 1)
  
  # Calculate mean gene methylation of the comparison using rowMeans(), convert to data frame and add to list.
  # Since the operation creates a data frame as output, the data frame only has one column.
  # Thus, using the rowMeans(.,))[, 1] operates on the first (and only) column
  mean.gene.methylation.CoV.list[[comparison]]$mean.methylation <-
    as.data.frame(methylation.transposed.matrix %>%
                    rowMeans(.,))[, 1]

    
  }
}


```

## Save mean.gene.methylation.CoV.list as R Object

Useful for quick access to list without need for processing all steps above.
```{r save-methylation-CoV-list-object}
# Save as an object to avoid having to re-run all of the code which generates this list (it takes a long time to run)
save(mean.gene.methylation.CoV.list, file = "../output/40-gene-methylation/mean.gene.methylation.CoV.list.RData")

```

## Add gene names to rows for CoV data frames
```{r gene-names-to-CoV-dataframes}
# Apply gene names to column
mean.gene.methylation.CoV.list <- lapply(mean.gene.methylation.CoV.list,
                                         function(x){ rownames(x) <- gene.names; x }
                                         )

# Convert row names to column and assign column name
mean.gene.methylation.CoV.list <- lapply(mean.gene.methylation.CoV.list,
                                         function(x){ rownames_to_column(x, var = "gene.id") }
                                         )

```

## Scatter plot all mean gene methylation CoV comparisons

### Colored by absolute delta CoV

Since all data frames are formatted the same (gene.id, comp1.CoV, comp2.CoV, comp1.mean.methylation, comp2.mean.methylation, mean.methylation, abs.delta.CoV) we can refer to them by column numbers.

HOWEVER, to do that, we need to extract the name of each data frame, using the `names()` function.

IMPORTANT: These plots do _not_ plot rows containing `NA` or `NaN`!!! As such, there is
a lot of missing data.
```{r scatter-plot-all-mean-gene-methylation-CoV-by-abs.delta}

# Find the maximum value across all data frames
max_abs.delta.CoV <- max(sapply(mean.gene.methylation.CoV.list, function(df) max(df$abs.delta.CoV, na.rm = TRUE)), na.rm = TRUE)


# Set color palette
# Not entirely sure how this works, as I took it from some plotting Steven was doing.
myPalette <- colorRampPalette(rev(brewer.pal(11, "Spectral")))

# Set scale colors for absolute values of the delta between comparisons
# Not entirely sure how this works, as I took it from some plotting Steven was doing.
sc <- scale_colour_gradientn(colours = myPalette(100), limits=c(0, max_abs.delta.CoV))

# Generate scatterplots
# Uses the following columns:

# 2 - comp1.CoV
# 3 - comp2.CoV
# 7 - abs.delta.CoV (for coloration)

# Skips any data frame with less than 4 columns (i.e. the `all` data frame)

lapply(
  mean.gene.methylation.CoV.list,
  function(df) {
    if (ncol(df) > 3) {
      names <- names(df)
      df %>% ggplot(aes_string(x=names[2], y=names[3], colour=names[7])) +
      geom_point(alpha = 0.6, size = 3) +
      geom_smooth(method = "lm") +
      geom_abline(size=1.0,colour="red") +
      sc
    }
  }
)
```
## Scatter plot all mean gene methylation CoV comparisons

### Colored by mean methylation

Since all data frames are formatted the same (gene.id, comp1.CoV, comp2.CoV, comp1.mean.methylation, comp2.mean.methylation, mean.methylation, abs.delta.CoV) we can refer to them by column numbers.

HOWEVER, to do that, we need to extract the name of each data frame, using the `names()` function.

IMPORTANT: These plots do _not_ plot rows containing `NA` or `NaN`!!! As such, there is
a lot of missing data.

```{r scatter-plot-all-mean-gene-methylation-CoV-by-mean-methylation}

# Find the maximum value across all data frames
max_mean.methylation <- max(sapply(mean.gene.methylation.CoV.list, function(df) max(df$mean.methylation, na.rm = TRUE)), na.rm = TRUE)

# Set color palette
# Not entirely sure how this works, as I took it from some plotting Steven was doing.
myPalette <- colorRampPalette(rev(brewer.pal(11, "Spectral")))

# Set scale colors for absolute values of the delta between comparisons
# Not entirely sure how this works, as I took it from some plotting Steven was doing.
sc <- scale_colour_gradientn(colours = myPalette(100), limits=c(0, max_mean.methylation))

# Generate scatterplots
# Uses the following columns:

# 2 - comp1.CoV
# 3 - comp2.CoV
# 6 - mean.methylation (for coloration)

# Skips any data frame with less than 4 columns (i.e. the `all` data frame)
lapply(
  mean.gene.methylation.CoV.list,
  function(df) {
      if (ncol(df) > 3) {
      names <- names(df)
      df %>% ggplot(aes_string(x=names[2], y=names[3], colour=names[6])) +
      geom_point(alpha = 0.6, size = 3) +
      geom_smooth(method = "lm") +
      geom_abline(size=1.0,colour="red") +
      sc
    }
  }
)
```

## Write CoV data frames to CSVs
```{r write-CoVs-to-CSVs}
# Write data frames to CSVs in ../output/40-gene-methylation/ dir
# Uses names of data frames as names of output files.
sapply(names(mean.gene.methylation.CoV.list),
       function(x) write.csv(mean.gene.methylation.CoV.list[[x]],
                             file = file.path("../output/40-gene-methylation/", paste(x, "CoV-mean-methylation", "csv", sep=".")),
                             quote = FALSE,
                             row.names = FALSE)
       )
```