---
title: "GenomicGFF-intersect-meth"
output: html_document
---

#Load necessary libraries
```{r}
library(tidyverse)
```


 # CODE TO GET METHYLATION INTERSECT WITH Genomic GFF


```{bash}
 cd /home/shared/8TB_HDD_01/sr320/github/ceabigr/data

 FILES=$(ls *bedgraph)

 for file in ${FILES}
 do
    NAME=$(echo ${file} | awk -F "_" '{print $1}')
    echo ${NAME}

   /home/shared/bedtools2/bin/intersectBed \
   -wb \
   -a ${NAME}_R1_val_1_10x.bedgraph \
   -b ~/ceabigr/genome-features/GCF_002022765.2_C_virginica-3.0_genomic.gff |\
  awk -F"\t" '{OFS=FS="\t"}{sub(/.*;gene=/,"",$13);sub(/;.*/,"",$13);print $1FS$2FS$3FS$4FS$5FS$8FS$9FS$7FS$13}' \
   > ~/ceabigr/output/${NAME}_mGFF.out
 done  
```
 # Get gene percent methylation by calculating the mean and median Loci percent methylation for each sample

#Want to see unique transposable elements through 26 different files/samples 
```{r}
# create a vector of filenames with full path 
filenames <- list.files(path = "~/ceabigr/output", pattern = "mGFF.out", full.names = TRUE)  

c <- data.frame()   # create empty dataframe to be populated with mean & median summary stats for each feature within a sample 

for (i in 1:length(filenames)) {  
  print(filenames[i])  # print out file location and name 
  testMte <- read.csv(file = filenames[i], sep = "\t", header = FALSE) # read in each sample data 
    # summarize methylation data per feature, chromosome, start, and end position with mean & median 
    group12 <- testMte %>% group_by(V5,V6,V7,V8,V9) %>%   
    summarize(avg = mean(V4, na.rm=TRUE), median=median(V4, na.rm=TRUE)) %>%

    # add new column with sample name 
    mutate(sample=gsub("/home/shared/8TB_HDD_02/strigg/ceabigr/output/", "", filenames[i]))
   # combine feature mean and median methylation for each sample in one common dataframe called b
  c <- rbind(c, group12)
}

#rename columns
colnames(c) <- c("chr","start","stop","feature_type","feature_name","mean_meth", "median_meth", "sample")

#clean up sample name
c$sample <- gsub("_mGFF.out","", c$sample)


#add column with feature name_feature_type
c$feature <- paste0(c$feature_name,"_",c$feature_type)

#order by feature so that features can be numbered correctly
c <- c[order(c$feature_type,c$chr,c$start,c$stop,c$feature_name),]

#add feature numbering
feature.rle <- rle(c$feature)

c$feature <- paste0(rep(feature.rle$values, times = feature.rle$lengths), "_",unlist(lapply(feature.rle$lengths, seq_len)))

#write out dataframe
write.table(c, file = "~/ceabigr/output/GFF_summary_allsamples.txt",quote = F, row.names = F, sep = "\t")
```