--- title: "14-blast" output: html_document date: "2024-06-25" --- Rmd to run BLAST on the _Pycnopodia helianthoides_ genome to annotate it and the DEG lists. Will use the example code provided at [Roberts Lab - resources - bio-Annotation](https://robertslab.github.io/resources/bio-Annotation/) Specifically: [https://robertslab.github.io/tusk/modules/04-blast.html](https://robertslab.github.io/tusk/modules/04-blast.html) The genome fasta lives: `../data/ncbi_dataset/data/GCA_032158295.1/GCA_032158295.1_ASM3215829v1_genomic.fna` ```{r} library(Biostrings) library(reshape2) ``` ```{r} #install.packages("reshape2") ``` # Create Database ```{r} current_time <- format(Sys.time(), "%B %d, %Y %H:%M:%S") cat("current date and time is ", current_time) ``` ```{bash} pwd ``` ```{bash} cd ../data curl -O https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz mv uniprot_sprot.fasta.gz uniprot_sprot_r2023_04.fasta.gz gunzip -k uniprot_sprot_r2023_04.fasta.gz ``` make the database ```{bash} mkdir ../blastdb /home/shared/ncbi-blast-2.11.0+/bin/makeblastdb \ -in ../data/uniprot_sprot_r2023_04.fasta \ -dbtype prot \ -out ../blastdb/uniprot_sprot_r2023_01 ``` # Query Fasta File as noted above, it lives: `../data/ncbi_dataset/data/GCA_032158295.1/GCA_032158295.1_ASM3215829v1_genomic.fna` ```{bash} head -3 ../data/ncbi_dataset/data/GCA_032158295.1/GCA_032158295.1_ASM3215829v1_genomic.fna ``` ```{bash} echo "How many sequences are there?" grep -c ">" ../data/ncbi_dataset/data/GCA_032158295.1/GCA_032158295.1_ASM3215829v1_genomic.fna ``` 1603 sequences Read in the fasta file: ```{r} # Read FASTA file fasta_file <- "../data/ncbi_dataset/data/GCA_032158295.1/GCA_032158295.1_ASM3215829v1_genomic.fna" # Replace with the name of your FASTA file sequences <- readDNAStringSet(fasta_file) # Calculate sequence lengths sequence_lengths <- width(sequences) # Create a data frame sequence_lengths_df <- data.frame(Length = sequence_lengths) # Plot histogram using ggplot2 ggplot(sequence_lengths_df, aes(x = Length)) + geom_histogram(binwidth = 1, color = "grey", fill = "blue", alpha = 0.75) + labs(title = "Histogram of Sequence Lengths", x = "Sequence Length", y = "Frequency") + theme_minimal() ``` ```{r} # Read FASTA file fasta_file <- "../data/ncbi_dataset/data/GCA_032158295.1/GCA_032158295.1_ASM3215829v1_genomic.fna" sequences <- readDNAStringSet(fasta_file) # Calculate base composition base_composition <- alphabetFrequency(sequences, baseOnly = TRUE) # Convert to data frame and reshape for ggplot2 base_composition_df <- as.data.frame(base_composition) base_composition_df$ID <- rownames(base_composition_df) base_composition_melted <- reshape2::melt(base_composition_df, id.vars = "ID", variable.name = "Base", value.name = "Count") # Plot base composition bar chart using ggplot2 ggplot(base_composition_melted, aes(x = Base, y = Count, fill = Base)) + geom_bar(stat = "identity", position = "dodge", color = "black") + labs(title = "Base Composition", x = "Base", y = "Count") + theme_minimal() + scale_fill_manual(values = c("A" = "green", "C" = "blue", "G" = "yellow", "T" = "red")) ``` ```{r} # Read FASTA file fasta_file <- "../data/ncbi_dataset/data/GCA_032158295.1/GCA_032158295.1_ASM3215829v1_genomic.fna" sequences <- readDNAStringSet(fasta_file) # Count CG motifs in each sequence count_cg_motifs <- function(sequence) { cg_motif <- "CG" return(length(gregexpr(cg_motif, sequence, fixed = TRUE)[[1]])) } cg_motifs_counts <- sapply(sequences, count_cg_motifs) # Create a data frame cg_motifs_counts_df <- data.frame(CG_Count = cg_motifs_counts) # Plot CG motifs distribution using ggplot2 ggplot(cg_motifs_counts_df, aes(x = CG_Count)) + geom_histogram(binwidth = 1, color = "black", fill = "blue", alpha = 0.75) + labs(title = "Distribution of CG Motifs", x = "Number of CG Motifs", y = "Frequency") + theme_minimal() ``` # Run BLASTx ```{bash} /home/shared/ncbi-blast-2.11.0+/bin/blastx \ -query ../data/ncbi_dataset/data/GCA_032158295.1/GCA_032158295.1_ASM3215829v1_genomic.fna \ -db ../blastdb/uniprot_sprot_r2023_01 \ -out ../analyses/14-blast/Phel_genome_blastout.tab \ -evalue 1E-20 \ -num_threads 20 \ -max_target_seqs 1 \ -outfmt 6 ``` ```{r} blast <- read.table("../analyses/14-blast/Phel_genome_blastout.tab") head(blast) ``` Convert pipes to tabs ### Convert pipes (`|`) to tabs. Makes downstream manipulation easier ```{bash convert-pipes-to-tabs} tr '|' '\t' < ../analyses/14-blast/Phel_genome_blastout.tab \ > ../analyses/14-blast/Phel_genome_blastout_sep.tab head ../analyses/14-blast/Phel_genome_blastout_sep.tab ``` # SP GO annotation following: https://rpubs.com/sr320/1026094 ```{r} blastsep <- read.table("../analyses/14-blast/Phel_genome_blastout_sep.tab", sep = '\t') head(blastsep) ``` ```{r} spgo <- read.csv("https://gannet.fish.washington.edu/seashell/snaps/uniprot_table_r2023_01.tab", sep = '\t', header = TRUE) ``` ```{r} install.packages("data.table") ``` ```{r} library(data.table) ``` ```{r} annot_tab <- left_join(blast, spgo, by = c("V3" = "Entry")) %>% select(V1, V3, V13, Protein.names, Organism, Gene.Ontology..biological.process., Gene.Ontology.IDs) ```