--- title: "10-cnv-pipeline" author: Steven Roberts date: "`r format(Sys.time(), '%d %B, %Y')`" output: # github_document: # toc: true # toc_depth: 3 # number_sections: true # html_preview: true html_document: theme: readable highlight: zenburn toc: true toc_float: true number_sections: true code_folding: show code_download: true --- ```{r setup, include=FALSE} library(knitr) library(tidyverse) library(kableExtra) library(DESeq2) library(pheatmap) library(RColorBrewer) library(data.table) library(DT) library(formattable) library(Biostrings) library(spaa) knitr::opts_chunk$set( echo = TRUE, # Display code chunks eval = FALSE, # Evaluate code chunks warning = FALSE, # Hide warnings message = FALSE, # Hide messages fig.width = 6, # Set plot width in inches fig.height = 4, # Set plot height in inches fig.align = "center" # Align plots to the center ) ``` ```{r} library(tidyverse) ``` Getting some sample coverage data starting with sorted bams... # Started with a little mox ``` # the following were all done at terminal #grab genome #curl -O https://gannet.fish.washington.edu/panopea/Cg-roslin/cgigas_uk_roslin_v1_genomic-mito.fa # sequence reads # wget -r \ # --no-directories --no-parent \ # -P . \ # -A "F*gz" \ # http://owl.fish.washington.edu/nightingales/C_gigas/ # # # # index genome # # /gscratch/srlab/programs/bowtie2-2.4.2-linux-x86_64/bowtie2-build \ # cgigas_uk_roslin_v1_genomic-mito.fa \ # cgigas_uk_roslin_v1_genomic-mito.index # # # # find *_R1_001_fastq.gz | xargs basename -s _R1_001_fastq.gz \ # | xargs -I{} /Applications/bioinfo/bowtie2-2.2.4/bowtie2 \ # -x cgigas_uk_roslin_v1_genomic-mito.index \ # -1 {}_R1_001_fastq.gz \ # -2 {}_R2_001_fastq.gz \ # -p 40 \ # -S {}.sam # # RAN THIS ON MOX # find *.bam | \ # xargs basename -s .bam | \ # xargs -I{} /gscratch/srlab/programs/samtools-1.10/samtools \ # sort --threads 28 {}.bam \ # -o {}.sorted.bam # find *sorted.bam | \ # xargs basename -s .sorted.bam | \ # xargs -I{} /gscratch/srlab/programs/bedtools-2.27.1/bin/bedtools genomecov \ # -ibam {}.sorted.bam -bg > {}.bedgraph ``` # Bedgraphs and the tidyverse Converting gene gff to bed ```{bash} awk 'OFS="\t" {print $1,$4-1,$5,$9,$6,$7}' ../data/cgigas_uk_roslin_v1_gene.gff > ../output/cgigas_uk_roslin_v1_gene.bed head ../output/cgigas_uk_roslin_v1_gene.bed ``` Cleaning up the bed ```{bash} awk 'BEGIN {OFS=FS="\t"} {sub(/:.*/, "", $4); print}' ../output/cgigas_uk_roslin_v1_gene.bed | sed 's/;Dbxref=GeneID//g' > ../output/cgigas_uk_roslin_v1_gene2.bed ``` ```{r engine='bash', eval=TRUE} head ../output/cgigas_uk_roslin_v1_gene2.bed ``` Getting coverage data for all bedgraphs over genes ```{bash} for f in ../data/*bedgraph do base=$(basename "$f" .bedgraph) /home/shared/bedtools2/bin/bedtools coverage -counts -a ../output/cgigas_uk_roslin_v1_gene2.bed -b "$f" > "../output/$base.coverage.txt" done ``` ```{r engine='bash', eval=TRUE} ls ../output/F*.coverage.txt ``` ```{r engine='bash', eval=TRUE} head ../output/F*.coverage.txt | head ``` # Reading in all files ```{r eval=TRUE, cache=TRUE} # Set the directory path where your files are located directory_path <- "../output/" # Define the regular expression pattern to match your files file_pattern <- "F.*coverage\\.txt" # Get the list of files that match the pattern in the specified directory file_list <- list.files(path = directory_path, pattern = file_pattern, full.names = TRUE) # Function to read and merge files merge_files <- function(file_paths) { # Initialize an empty data frame merged_table <- data.frame() # Loop through each file and merge it with the existing data for (file_path in file_paths) { # Read the file into a data frame table <- read.table(file_path, header = FALSE, sep = "\t") # Assign column names to the data frame colnames(table) <- c("Chr", "Start", "End", "ID", "Dot", "Strand", "Value") # Get the base filename without the extension base_filename <- tools::file_path_sans_ext(basename(file_path)) # Add a new column with the base filename to the data frame table$Filename <- base_filename # Merge the data frame with the existing data merged_table <- rbind(merged_table, table) } # Return the final merged data frame return(merged_table) } # Call the function to merge all the files merged_coverage <- merge_files(file_list) # Print the merged data frame head(merged_coverage) ``` ```{r} save(merged_coverage, file = "../output/merged_coverage.RData") ``` ```{r eval=TRUE} datatable( merged_coverage %>% mutate(gene = str_extract(ID, "LOC\\d+")) %>% mutate(sample = str_extract(Filename, "^[^.]+")) %>% select(gene, sample, value = Value) %>% mutate(family = substr(sample, 1, 4)) ) ``` ```{r eval=TRUE} datatable( merged_coverage %>% mutate(gene = str_extract(ID, "LOC\\d+")) %>% mutate(sample = str_extract(Filename, "^[^.]+")) %>% select(gene, sample, value = Value) %>% mutate(family = substr(sample, 1, 4)) %>% group_by(gene,family) %>% summarize(average_coverage = mean(value, na.rm = TRUE)) ) ``` ```{r eval=TRUE} datatable( merged_coverage %>% mutate(gene = str_extract(ID, "LOC\\d+")) %>% mutate(sample = str_extract(Filename, "^[^.]+")) %>% select(gene, sample, value = Value) %>% mutate(family = substr(sample, 1, 4)) %>% group_by(gene,family) %>% summarize(average_coverage = mean(value, na.rm = TRUE)) %>% pivot_wider(names_from = family, values_from = average_coverage) ) ``` ```{r eval=TRUE} datatable( familydf <- merged_coverage %>% mutate(gene = str_extract(ID, "LOC\\d+")) %>% mutate(sample = str_extract(Filename, "^[^.]+")) %>% select(gene, sample, value = Value) %>% mutate(family = substr(sample, 1, 4)) %>% group_by(gene,family) %>% summarize(average_coverage = mean(value, na.rm = TRUE)) %>% pivot_wider(names_from = family, values_from = average_coverage) ) ``` # genes with coverage higher triploids Meaning likely higher gene counts gene expansion in triploids ```{r eval=TRUE} familydf %>% rowwise() %>% mutate(factor = (mean(c(F052, F142)) / mean(c(F053, F143)))) %>% mutate(f05dif = (log2(F052/F053))) %>% mutate(f14dif = (log2(F142/F143))) %>% mutate(exp = mean(c(F052, F142, F053, F143))) %>% filter(f05dif < -1) %>% filter(f14dif < -1) %>% filter(exp > 50) ``` # genes with coverage higher diploids ```{r eval=TRUE} familydf %>% rowwise() %>% mutate(factor = (mean(c(F052, F142)) / mean(c(F053, F143)))) %>% mutate(f05dif = (log2(F052/F053))) %>% mutate(f14dif = (log2(F142/F143))) %>% mutate(exp = mean(c(F052, F142, F053, F143))) %>% filter(f05dif > 1) %>% filter(f14dif > 1) %>% filter(exp > 50) ```