--- title: "05 Ptua bismark" author: Steven Roberts date: "`r format(Sys.time(), '%d %B, %Y')`" output: html_document: theme: readable highlight: zenburn toc: true toc_float: true number_sections: true code_folding: show code_download: true --- ```{r setup, include=FALSE} library(knitr) library(tidyverse) knitr::opts_chunk$set( echo = TRUE, # Display code chunks eval = FALSE, # Evaluate code chunks warning = FALSE, # Hide warnings message = FALSE, # Hide messages fig.width = 6, # Set plot width in inches fig.height = 4, # Set plot height in inches fig.align = "center" # Align plots to the center ) ``` Shelly did alignment ``` # run pipeline nextflow run nf-core/methylseq \ -c /gscratch/srlab/strigg/bin/uw_hyak_srlab.config \ --input /gscratch/scrubbed/strigg/analyses/20250422_methylseq/samplesheet.csv \ --outdir /gscratch/scrubbed/strigg/analyses/20250422_methylseq \ --fasta /gscratch/srlab/strigg/GENOMES/Pocillopora_meandrina_HIv1.assembly.fasta \ --em_seq \ -resume \ -with-report nf_report.html \ -with-trace \ -with-timeline nf_timeline.html \ --skip_trimming \ --nomeseq ### Results - Multiqc report: [https://gannet.fish.washington.edu/metacarcinus/E5/Ptuahiniensis/20250422_methylseq/multiqc/bismark/multiqc_report.html](https://gannet.fish.washington.edu/metacarcinus/E5/Ptuahiniensis/20250422_methylseq/multiqc/bismark/multiqc_report.html) - Bismark summary report: [https://gannet.fish.washington.edu/metacarcinus/E5/Ptuahiniensis/20250422_methylseq/bismark/summary/bismark_summary_report.html](https://gannet.fish.washington.edu/metacarcinus/E5/Ptuahiniensis/20250422_methylseq/bismark/summary/bismark_summary_report.html) - Pipeline report: [https://gannet.fish.washington.edu/metacarcinus/E5/Ptuahiniensis/20250422_methylseq/nf_report.html](https://gannet.fish.washington.edu/metacarcinus/E5/Ptuahiniensis/20250422_methylseq/nf_report.html) - Pipeline timeline: [https://gannet.fish.washington.edu/metacarcinus/E5/Ptuahiniensis/20250422_methylseq/nf_timeline.html](https://gannet.fish.washington.edu/metacarcinus/E5/Ptuahiniensis/20250422_methylseq/nf_timeline.html) - Counts matrices: [https://gannet.fish.washington.edu/metacarcinus/E5/Ptuahiniensis/20250422_methylseq/bismark/methylation_calls/methylation_coverage/](https://gannet.fish.washington.edu/metacarcinus/E5/Ptuahiniensis/20250422_methylseq/bismark/methylation_calls/methylation_coverage/) - .fastp-trim_bismark_bt2_pe.deduplicated.bismark.cov.gz - Deduplicated sorted bam files: [https://gannet.fish.washington.edu/metacarcinus/E5/Ptuahiniensis/20250422_methylseq/bismark/deduplicated/](https://gannet.fish.washington.edu/metacarcinus/E5/Ptuahiniensis/20250422_methylseq/bismark/deduplicated/) - .deduplicated.sorted.bam - Other bismark output: [https://gannet.fish.washington.edu/metacarcinus/E5/Ptuahiniensis/20250422_methylseq/bismark/](https://gannet.fish.washington.edu/metacarcinus/E5/Ptuahiniensis/20250422_methylseq/bismark/) ``` ## Downloading deduplcated coveage files ```{r,engine='bash'} cd ../data/dedup-cov wget -r -np -nH --cut-dirs=5 --reject "index.html*" https://gannet.fish.washington.edu/metacarcinus/E5/Ptuahiniensis/20250422_methylseq/bismark/methylation_calls/methylation_coverage/ ``` # Methylation call ``` find ../output/09-meth-quant/*deduplicated.bismark.cov.gz | \ xargs -n 1 basename -s _pe.deduplicated.bismark.cov.gz | \ parallel -j 48 /home/shared/Bismark-0.24.0/coverage2cytosine \ --genome_folder ../data/ \ -o ../output/09-meth-quant/{} \ --merge_CpG \ --zero_based \ ../output/09-meth-quant/{}_pe.deduplicated.bismark.cov.gz ``` ```{r,engine='bash'} cd ../data curl -O https://gannet.fish.washington.edu/gitrepos/urol-e5/timeseries_molecular/F-Ptua/data/Bisulfite_Genome.tar.gz ``` ```{r, engine='bash'} tar -xzvf ../data/Bisulfite_Genome.tar.gz ``` ```{r,engine='bash'} cd ../data/bs curl -O https://owl.fish.washington.edu/halfshell/genomic-databank/Pocillopora_meandrina_HIv1.assembly.fasta ``` ```{bash} find ../data/dedup-cov/methylation_calls/methylation_coverage/*deduplicated.bismark.cov.gz | \ xargs -n 1 basename -s R1_001.fastp-trim_bismark_bt2_pe.deduplicated.bismark.cov.gz | \ parallel -j 24 /home/shared/Bismark-0.24.0/coverage2cytosine \ --genome_folder ../data/bs \ -o ../output/05-Ptua-bismark-CG/{} \ --merge_CpG \ --zero_based \ ../data/dedup-cov/methylation_calls/methylation_coverage/{}R1_001.fastp-trim_bismark_bt2_pe.deduplicated.bismark.cov.gz ``` ```{bash} head ../output/05-Ptua-bismark-CG/*53*evidence.cov ``` # Converting cov files to other typesl ```{bash} cd ../output/05-Ptua-bismark-CG/ for f in *merged_CpG_evidence.cov do STEM=$(basename "${f}" _.CpG_report.merged_CpG_evidence.cov) cat "${f}" | awk -F $'\t' 'BEGIN {OFS = FS} {if ($5+$6 >= 5) {print $1, $2, $3, $4}}' \ > "${STEM}"_10x.bedgraph done ``` # take all 10x bedgraph and given CG loci unique ID ```{bash} for file in ../output/05-Ptua-bismark-CG/*10x.bedgraph; do awk '{print "CpG_"_$1"_"$2, $4}' "$file" > "${file%.bedgraph}_processed.txt" done ``` ```{bash} for file in ../output/05-Ptua-bismark-CG/*10x_processed.txt; do head -1 $file done ``` ```{r, engine='bash'} head ../output/05-Ptua-bismark-CG/*57-TP4*txt ``` # Load libraries ```{r, warning=FALSE, message=FALSE} library(tidyverse) library(ggplot2) library(DESeq2) library(igraph) library(psych) library(tidygraph) library(ggraph) library(WGCNA) library(edgeR) library(reshape2) library(ggcorrplot) library(corrplot) library(rvest) library(purrr) library(pheatmap) library(glmnet) library(caret) library(factoextra) library(vegan) library(ggfortify) library(genefilter) library(scales) library(purrr) ``` ## WGBS data ```{r, eval=FALSE} #pull processed files from Gannet # Note: Unfortunately we can't use the `cache` feature to make this process more time efficient, as it doesn't support long vectors # Define the base URL base_url <- "https://gannet.fish.washington.edu/v1_web/owlshell/bu-github/timeseries_molecular/F-Ptua/output/05-Ptua-bismark-CG/" # Read the HTML page page <- read_html(base_url) # Extract links to files file_links <- page %>% html_nodes("a") %>% html_attr("href") # Filter for files ending in "processed.txt" processed_files <- file_links[grepl("processed\\.txt$", file_links)] # Create full URLs file_urls <- paste0(base_url, processed_files) # Function to read a file from URL read_processed_file <- function(url) { read_table(url, col_types = cols(.default = "c")) # Read as character to avoid parsing issues } # Import all processed files into a list processed_data <- lapply(file_urls, read_processed_file) # Name the list elements by file name names(processed_data) <- processed_files # Print structure of imported data str(processed_data) # add a header row that has "CpG" for the first column and "sample" for the second column, which will be populated by the file name processed_data <- Map(function(df, filename) { colnames(df) <- c("CpG", filename) # Rename columns return(df) }, processed_data, names(processed_data)) # Use stored file names #merge files together by "CpG" merged_data <- purrr::reduce(processed_data, full_join, by = "CpG") # Print structure of final merged data str(merged_data) ``` Replace any NA with 0. ```{r, eval=FALSE} # Convert all columns (except "CpG") to numeric and replace NAs with 0 merged_data <- merged_data %>% mutate(across(-CpG, as.numeric)) %>% # Convert all except CpG to numeric mutate(across(-CpG, ~ replace_na(.x, 0))) # Replace NA with 0 in numeric columns ``` ## Filter data sets Only keep CpGs that have a non-zero value in all samples. ```{r, eval=FALSE} filtered_wgbs <- merged_data %>% filter(if_all(-CpG, ~ .x > 0)) # Ensure it's formatted as a data frame filtered_wgbs <- as.data.frame(filtered_wgbs) # Only keep the sample information in the column name. colnames(filtered_wgbs) <- gsub("^(.*?)_.*$", "\\1", colnames(filtered_wgbs)) # Set CpG IDs to rownames rownames(filtered_wgbs) <- filtered_wgbs$CpG filtered_wgbs <- filtered_wgbs %>% select(-CpG) nrow(merged_data) nrow(filtered_wgbs) ``` We had xxxxxx xxCpGs before filtering and have only xxxxx after filtering. This makes sense because most CpGs were not methylated in all samples. Save filtered set to make code reruns/knitting quicker ```{r, eval=FALSE} write.csv(filtered_wgbs, "../output/05-Ptua-bismark-CG/filtered-WGBS-CpG-counts.csv") ``` ```{r, engine='bash'} head ../output/05-Ptua-bismark-CG/filtered-WGBS-CpG-counts.csv ``` ```{r, engine='bash'} wc -l ../output/05-Ptua-bismark-CG/filtered-WGBS-CpG-counts.csv ```