Code to annotate our P. evermanni reference files (the P.evermanni transcriptome and genome) with GO information

1 Transcriptome

1.1 Retrieve transcriptome fasta file

We’ll be using the P. evermanni genes fasta file constructed using the P. evermanni gff and scaffold fasta, stored here. Accessible on the deep-dive genomic resources page, including links to relevant code.

curl https://gannet.fish.washington.edu/kdurkin1/deep-dive/E-Peve/data/Porites_evermanni_CDS.fasta \
-k \
> ../../data/Porites_evermanni_CDS.fasta

Let’s check the file

echo "First few lines:"
head -3 ../../data/Porites_evermanni_CDS.fasta

echo ""
echo "How many sequences are there?"
grep -c ">" ../../data/Porites_evermanni_CDS.fasta
## First few lines:
## >Parent=Peve_00000001 Porites_evermani_scaffold_1:3106-3444,4283-4488
## TTACTGCTTCAGTATGTGAATTTCGATGGTGGCTTGACCGGAGTTAGACATGGCCCCCCTTGCTCGGAGTCCCGATCCCAAATCTCTCTCGTGCCAGTAGTTATCTCCTTTGAAGGGATTATCGTAATACATCCGGTACCAAAGATTGTAGTTGGCGCGTCTTTTACCACTGTAAAGCCAAACATCAAACCAGTTGCTGTACCAGTTGTAGTCAAATGGAACGGAATACATAACAGCAAGTGTTTTATCGATGTGTGGGATGTAGTATGTTAATACTCCTACTGCGCCTCTCGCAACGGGCCCCGCAGTTTTTCGTGCGCCGTAAAGCAAAGCTGTGCCTGAAGAAACATCATGAGGCAAAACACGATTTGACGTCCCTGAATAAAAATATATGTTGACTGCTCTCCATTTATATCCACTTTCGTTATCAACACCAATGGCGACCTTGCGGCTTATGCTACCAAGGGTGTTTAAAATTGTTGTGAGAATGCCCAAGCCAAGTTGAGCACCGCTGATGACAGCACCAGCGTCAGCTAAAATTTT
## >Parent=Peve_00000002 Porites_evermani_scaffold_1:424478-425361,426180-426735,427012-427140,427664-427724,428641-429034
## 
## How many sequences are there?
## 40389

For simplicity, let’s reduce the sequence names to just the unique identifier “Parent=Peve_########”

# Read FASTA file
fasta_file <- "../../data/Porites_evermanni_CDS.fasta"  # Replace with the name of your FASTA file
sequences <- readDNAStringSet(fasta_file)

# For simplicity, let's reduce the sequence names to just the unique identifier "Parent=Peve_########"
names(sequences) <- gsub("^(Parent=Peve_\\d+).*", "\\1", names(sequences))
head(names(sequences))
## [1] "Parent=Peve_00000001" "Parent=Peve_00000002" "Parent=Peve_00000003"
## [4] "Parent=Peve_00000004" "Parent=Peve_00000005" "Parent=Peve_00000006"
# Calculate sequence lengths
sequence_lengths <- width(sequences)

# Create a data frame
sequence_lengths_df <- data.frame(Length = sequence_lengths)

# Plot histogram using ggplot2
ggplot(sequence_lengths_df, aes(x = Length)) +
  geom_histogram(binwidth = 1, color = "black", fill = "blue", alpha = 0.75) +
  labs(title = "Histogram of Sequence Lengths",
       x = "Sequence Length",
       y = "Frequency") +
  theme_minimal()

summary(sequence_lengths_df)
##      Length     
##  Min.   :  102  
##  1st Qu.:  519  
##  Median :  933  
##  Mean   : 1338  
##  3rd Qu.: 1617  
##  Max.   :63597
# Calculate base composition
base_composition <- alphabetFrequency(sequences, baseOnly = TRUE)

# Convert to data frame and reshape for ggplot2
base_composition_df <- as.data.frame(base_composition)
base_composition_df$ID <- rownames(base_composition_df)
base_composition_melted <- reshape2::melt(base_composition_df, id.vars = "ID", variable.name = "Base", value.name = "Count")

# Plot base composition bar chart using ggplot2
ggplot(base_composition_melted, aes(x = Base, y = Count, fill = Base)) +
  geom_bar(stat = "identity", position = "dodge", color = "black") +
  labs(title = "Base Composition",
       x = "Base",
       y = "Count") +
  theme_minimal() +
  scale_fill_manual(values = c("A" = "green", "C" = "blue", "G" = "yellow", "T" = "red"))

# Count CG motifs in each sequence
count_cg_motifs <- function(sequence) {
  cg_motif <- "CG"
  return(length(gregexpr(cg_motif, sequence, fixed = TRUE)[[1]]))
}

cg_motifs_counts <- sapply(sequences, count_cg_motifs)

# Create a data frame
cg_motifs_counts_df <- data.frame(CG_Count = cg_motifs_counts)

# Plot CG motifs distribution using ggplot2
ggplot(cg_motifs_counts_df, aes(x = CG_Count)) +
  geom_histogram(binwidth = 1, color = "black", fill = "blue", alpha = 0.75) +
  labs(title = "Distribution of CG Motifs",
       x = "Number of CG Motifs",
       y = "Frequency") +
  theme_minimal()

1.2 Database Creation

1.2.1 Obtain Fasta (UniProt/Swiss-Prot)

cd ../../data
curl -O https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz
mv uniprot_sprot.fasta.gz uniprot_sprot_r2023_04.fasta.gz
gunzip -k uniprot_sprot_r2023_04.fasta.gz

1.2.2 Making the database

/home/shared/ncbi-blast-2.11.0+/bin/makeblastdb \
-in ../../data/uniprot_sprot_r2023_04.fasta \
-dbtype prot \
-out ../../blastdb/uniprot_sprot_r2023_04

1.3 Running Blastx

/home/shared/ncbi-blast-2.11.0+/bin/blastx \
-query ../../data/Porites_evermanni_CDS.fasta \
-db ../../blastdb/uniprot_sprot_r2023_04 \
-out ../output/02-Peve-reference-annotation/Porites_evermanni_CDS-uniprot_blastx.tab \
-evalue 1E-20 \
-num_threads 20 \
-max_target_seqs 1 \
-outfmt 6
echo "First few lines:"
head -2 ../output/02-Peve-reference-annotation/Porites_evermanni_CDS-uniprot_blastx.tab

echo "Number of lines in output:"
wc -l ../output/02-Peve-reference-annotation/Porites_evermanni_CDS-uniprot_blastx.tab
## First few lines:
## Parent=Peve_00000001 sp|P61915|ACTPC_ACTTE   57.714  175 72  2   528 7   40  213 2.22e-58    184
## Parent=Peve_00000002 sp|Q569C3|UBP1_RAT  48.000  200 94  5   1287    703 407 601 1.43e-45    177
## Number of lines in output:
## 25243 ../output/02-Peve-reference-annotation/Porites_evermanni_CDS-uniprot_blastx.tab

1.4 Joining Blast table with annoations.

1.4.1 Prepping Blast table for easy join

tr '|' '\t' < ../output/02-Peve-reference-annotation/Porites_evermanni_CDS-uniprot_blastx.tab \
> ../output/02-Peve-reference-annotation/Porites_evermanni_CDS-uniprot_blastx_sep.tab

head -1 ../output/02-Peve-reference-annotation/Porites_evermanni_CDS-uniprot_blastx_sep.tab
## Parent=Peve_00000001 sp  P61915  ACTPC_ACTTE 57.714  175 72  2   528 7   40  213 2.22e-58    184

1.4.2 Could do some cool stuff in R here reading in table

bltabl <- read.csv("../output/02-Peve-reference-annotation/Porites_evermanni_CDS-uniprot_blastx_sep.tab", sep = '\t', header = FALSE)

spgo <- read.csv("https://gannet.fish.washington.edu/seashell/snaps/uniprot_table_r2023_01.tab", sep = '\t', header = TRUE)

datatable(head(bltabl), options = list(scrollX = TRUE, scrollY = "400px", scrollCollapse = TRUE, paging = FALSE))
datatable(head(spgo), options = list(scrollX = TRUE, scrollY = "400px", scrollCollapse = TRUE, paging = FALSE))
datatable(
  left_join(bltabl, spgo,  by = c("V3" = "Entry")) %>%
  select(V1, V3, V13, Protein.names, Organism, Gene.Ontology..biological.process., Gene.Ontology.IDs)
)
annot_tab <-
  left_join(bltabl, spgo,  by = c("V3" = "Entry")) %>%
  select(V1, V3, V13, Protein.names, Organism, Gene.Ontology..biological.process., Gene.Ontology.IDs)

write.table(annot_tab, file = "../output/02-Peve-reference-annotation/Porites_evermanni_CDS-IDmapping-2024_09_04.tab", sep = "\t",
            row.names = TRUE, col.names = NA)
head -n 3 ../output/02-Peve-reference-annotation/Porites_evermanni_CDS-IDmapping-2024_09_04.tab
# Read dataset
#dataset <- read.csv("../output/blast_annot_go.tab", sep = '\t')  # Replace with the path to your dataset

# Select the column of interest
column_name <- "Organism"  # Replace with the name of the column of interest
column_data <- annot_tab[[column_name]]

# Count the occurrences of the strings in the column
string_counts <- table(column_data)

# Convert to a data frame, sort by count, and select the top 10
string_counts_df <- as.data.frame(string_counts)
colnames(string_counts_df) <- c("String", "Count")
string_counts_df <- string_counts_df[order(string_counts_df$Count, decreasing = TRUE), ]
top_10_strings <- head(string_counts_df, n = 10)

# Plot the top 10 most common strings using ggplot2
ggplot(top_10_strings, aes(x = reorder(String, -Count), y = Count, fill = String)) +
  geom_bar(stat = "identity", position = "dodge", color = "black") +
  labs(title = "Top 10 Species hits",
       x = column_name,
       y = "Count") +
  theme_minimal() +
  theme(legend.position = "none") +
  coord_flip()

#data <- read.csv("../output/blast_annot_go.tab", sep = '\t')

# Rename the `Gene.Ontology..biological.process.` column to `Biological_Process`
colnames(annot_tab)[colnames(annot_tab) == "Gene.Ontology..biological.process."] <- "Biological_Process"

# Separate the `Biological_Process` column into individual biological processes
data_separated <- unlist(strsplit(annot_tab$Biological_Process, split = ";"))

# Trim whitespace from the biological processes
data_separated <- gsub("^\\s+|\\s+$", "", data_separated)

# Count the occurrences of each biological process
process_counts <- table(data_separated)
process_counts <- data.frame(Biological_Process = names(process_counts), Count = as.integer(process_counts))
process_counts <- process_counts[order(-process_counts$Count), ]

# Select the 20 most predominant biological processes
top_20_processes <- process_counts[1:20, ]

# Create a color palette for the bars
bar_colors <- rainbow(nrow(top_20_processes))

# Create a staggered vertical bar plot with different colors for each bar
barplot(top_20_processes$Count, names.arg = rep("", nrow(top_20_processes)), col = bar_colors,
        ylim = c(0, max(top_20_processes$Count) * 1.25),
        main = "Occurrences of the 20 Most Predominant Biological Processes", xlab = "Biological Process", ylab = "Count")

# Create a separate plot for the legend
png("../output/02-Peve-reference-annotation/GOlegend.png", width = 800, height = 600)
par(mar = c(0, 0, 0, 0))
plot.new()
legend("center", legend = top_20_processes$Biological_Process, fill = bar_colors, cex = 1, title = "Biological Processes")
dev.off()
## png 
##   2
knitr::include_graphics("../output/02-Peve-reference-annotation/GOlegend.png")

rm ../output/02-Peve-reference-annotation/GOlegend.png
---
title: "02-Peve-reference-annotation"
author: "Kathleen Durkin"
date: "2024-09-04"
always_allow_html: true
output: 
  bookdown::html_document2:
    theme: cosmo
    toc: true
    toc_float: true
    number_sections: true
    code_folding: show
    code_download: true
  github_document:
    toc: true
    toc_depth: 3
    number_sections: true
    html_preview: true 
---

```{r setup, include=FALSE}
library(knitr)
library(tidyverse)
library(kableExtra)
library(DT)
library(Biostrings)
library(tm)
knitr::opts_chunk$set(
  echo = TRUE,         # Display code chunks
  eval = FALSE,         # Evaluate code chunks
  warning = FALSE,     # Hide warnings
  message = FALSE,     # Hide messages
  fig.width = 6,       # Set plot width in inches
  fig.height = 4,      # Set plot height in inches
  fig.align = "center" # Align plots to the center
)
```

Code to annotate our *P. evermanni* reference files (the *P.evermanni* transcriptome and genome) with GO information

# Transcriptome
## Retrieve transcriptome fasta file

We'll be using the *P. evermanni* genes fasta file constructed using the *P. evermanni* gff and scaffold fasta, stored [here](https://gannet.fish.washington.edu/kdurkin1/deep-dive/E-Peve/data/Porites_evermanni_CDS.fasta). Accessible on the `deep-dive` [genomic resources page](https://github.com/urol-e5/deep-dive/wiki/Species-Characteristics-and-Genomic-Resources#genomic-resources), including links to relevant code.

```{r download-transcriptome, engine='bash'}
curl https://gannet.fish.washington.edu/kdurkin1/deep-dive/E-Peve/data/Porites_evermanni_CDS.fasta \
-k \
> ../../data/Porites_evermanni_CDS.fasta
```

Let's check the file

```{r transcriptome-view-query, engine='bash', eval=TRUE}
echo "First few lines:"
head -3 ../../data/Porites_evermanni_CDS.fasta

echo ""
echo "How many sequences are there?"
grep -c ">" ../../data/Porites_evermanni_CDS.fasta
```

For simplicity, let's reduce the sequence names to just the unique identifier "Parent=Peve_########"


```{r transcriptome-seqlength-histogram, eval=TRUE}
# Read FASTA file
fasta_file <- "../../data/Porites_evermanni_CDS.fasta"  # Replace with the name of your FASTA file
sequences <- readDNAStringSet(fasta_file)

# For simplicity, let's reduce the sequence names to just the unique identifier "Parent=Peve_########"
names(sequences) <- gsub("^(Parent=Peve_\\d+).*", "\\1", names(sequences))
head(names(sequences))

# Calculate sequence lengths
sequence_lengths <- width(sequences)

# Create a data frame
sequence_lengths_df <- data.frame(Length = sequence_lengths)

# Plot histogram using ggplot2
ggplot(sequence_lengths_df, aes(x = Length)) +
  geom_histogram(binwidth = 1, color = "black", fill = "blue", alpha = 0.75) +
  labs(title = "Histogram of Sequence Lengths",
       x = "Sequence Length",
       y = "Frequency") +
  theme_minimal()

summary(sequence_lengths_df)
```

```{r transcriptome-ACGT-composition, eval=TRUE}

# Calculate base composition
base_composition <- alphabetFrequency(sequences, baseOnly = TRUE)

# Convert to data frame and reshape for ggplot2
base_composition_df <- as.data.frame(base_composition)
base_composition_df$ID <- rownames(base_composition_df)
base_composition_melted <- reshape2::melt(base_composition_df, id.vars = "ID", variable.name = "Base", value.name = "Count")

# Plot base composition bar chart using ggplot2
ggplot(base_composition_melted, aes(x = Base, y = Count, fill = Base)) +
  geom_bar(stat = "identity", position = "dodge", color = "black") +
  labs(title = "Base Composition",
       x = "Base",
       y = "Count") +
  theme_minimal() +
  scale_fill_manual(values = c("A" = "green", "C" = "blue", "G" = "yellow", "T" = "red"))
```


```{r transcriptome-cg-motifs, eval=TRUE}

# Count CG motifs in each sequence
count_cg_motifs <- function(sequence) {
  cg_motif <- "CG"
  return(length(gregexpr(cg_motif, sequence, fixed = TRUE)[[1]]))
}

cg_motifs_counts <- sapply(sequences, count_cg_motifs)

# Create a data frame
cg_motifs_counts_df <- data.frame(CG_Count = cg_motifs_counts)

# Plot CG motifs distribution using ggplot2
ggplot(cg_motifs_counts_df, aes(x = CG_Count)) +
  geom_histogram(binwidth = 1, color = "black", fill = "blue", alpha = 0.75) +
  labs(title = "Distribution of CG Motifs",
       x = "Number of CG Motifs",
       y = "Frequency") +
  theme_minimal()
```

## Database Creation

### Obtain Fasta (UniProt/Swiss-Prot)

```{r download-UniPSwissP-data, engine='bash'}
cd ../../data
curl -O https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz
mv uniprot_sprot.fasta.gz uniprot_sprot_r2023_04.fasta.gz
gunzip -k uniprot_sprot_r2023_04.fasta.gz
```

### Making the database

```{r make-UniPSwissP-blastdb, engine='bash'}
/home/shared/ncbi-blast-2.11.0+/bin/makeblastdb \
-in ../../data/uniprot_sprot_r2023_04.fasta \
-dbtype prot \
-out ../../blastdb/uniprot_sprot_r2023_04
```


## Running Blastx

```{r transcriptome-blastx, engine='bash'}
/home/shared/ncbi-blast-2.11.0+/bin/blastx \
-query ../../data/Porites_evermanni_CDS.fasta \
-db ../../blastdb/uniprot_sprot_r2023_04 \
-out ../output/02-Peve-reference-annotation/Porites_evermanni_CDS-uniprot_blastx.tab \
-evalue 1E-20 \
-num_threads 20 \
-max_target_seqs 1 \
-outfmt 6
```

```{r transcriptome-blast-look, engine='bash', eval=TRUE}
echo "First few lines:"
head -2 ../output/02-Peve-reference-annotation/Porites_evermanni_CDS-uniprot_blastx.tab

echo "Number of lines in output:"
wc -l ../output/02-Peve-reference-annotation/Porites_evermanni_CDS-uniprot_blastx.tab
```


## Joining Blast table with annoations.

### Prepping Blast table for easy join

```{r transcriptome-separate, engine='bash', eval=TRUE}
tr '|' '\t' < ../output/02-Peve-reference-annotation/Porites_evermanni_CDS-uniprot_blastx.tab \
> ../output/02-Peve-reference-annotation/Porites_evermanni_CDS-uniprot_blastx_sep.tab

head -1 ../output/02-Peve-reference-annotation/Porites_evermanni_CDS-uniprot_blastx_sep.tab

```

### Could do some cool stuff in R here reading in table

```{r transcriptome-read-data, eval=TRUE, cache=TRUE}
bltabl <- read.csv("../output/02-Peve-reference-annotation/Porites_evermanni_CDS-uniprot_blastx_sep.tab", sep = '\t', header = FALSE)

spgo <- read.csv("https://gannet.fish.washington.edu/seashell/snaps/uniprot_table_r2023_01.tab", sep = '\t', header = TRUE)

datatable(head(bltabl), options = list(scrollX = TRUE, scrollY = "400px", scrollCollapse = TRUE, paging = FALSE))
```

```{r transcriptome-spgo-table, eval=TRUE}
datatable(head(spgo), options = list(scrollX = TRUE, scrollY = "400px", scrollCollapse = TRUE, paging = FALSE))
```

```{r transcriptome-see, eval=TRUE}
datatable(
  left_join(bltabl, spgo,  by = c("V3" = "Entry")) %>%
  select(V1, V3, V13, Protein.names, Organism, Gene.Ontology..biological.process., Gene.Ontology.IDs)
)
```

```{r transcriptome-join, eval=TRUE}
annot_tab <-
  left_join(bltabl, spgo,  by = c("V3" = "Entry")) %>%
  select(V1, V3, V13, Protein.names, Organism, Gene.Ontology..biological.process., Gene.Ontology.IDs)

write.table(annot_tab, file = "../output/02-Peve-reference-annotation/Porites_evermanni_CDS-IDmapping-2024_09_04.tab", sep = "\t",
            row.names = TRUE, col.names = NA)
```

```{r transcriptome-view-headers, engine='bash'}
head -n 3 ../output/02-Peve-reference-annotation/Porites_evermanni_CDS-IDmapping-2024_09_04.tab
```

```{r transcriptome-species-hits, eval=TRUE}
# Read dataset
#dataset <- read.csv("../output/blast_annot_go.tab", sep = '\t')  # Replace with the path to your dataset

# Select the column of interest
column_name <- "Organism"  # Replace with the name of the column of interest
column_data <- annot_tab[[column_name]]

# Count the occurrences of the strings in the column
string_counts <- table(column_data)

# Convert to a data frame, sort by count, and select the top 10
string_counts_df <- as.data.frame(string_counts)
colnames(string_counts_df) <- c("String", "Count")
string_counts_df <- string_counts_df[order(string_counts_df$Count, decreasing = TRUE), ]
top_10_strings <- head(string_counts_df, n = 10)

# Plot the top 10 most common strings using ggplot2
ggplot(top_10_strings, aes(x = reorder(String, -Count), y = Count, fill = String)) +
  geom_bar(stat = "identity", position = "dodge", color = "black") +
  labs(title = "Top 10 Species hits",
       x = column_name,
       y = "Count") +
  theme_minimal() +
  theme(legend.position = "none") +
  coord_flip()
```

```{r transcriptome-top-go, eval=TRUE}

#data <- read.csv("../output/blast_annot_go.tab", sep = '\t')

# Rename the `Gene.Ontology..biological.process.` column to `Biological_Process`
colnames(annot_tab)[colnames(annot_tab) == "Gene.Ontology..biological.process."] <- "Biological_Process"

# Separate the `Biological_Process` column into individual biological processes
data_separated <- unlist(strsplit(annot_tab$Biological_Process, split = ";"))

# Trim whitespace from the biological processes
data_separated <- gsub("^\\s+|\\s+$", "", data_separated)

# Count the occurrences of each biological process
process_counts <- table(data_separated)
process_counts <- data.frame(Biological_Process = names(process_counts), Count = as.integer(process_counts))
process_counts <- process_counts[order(-process_counts$Count), ]

# Select the 20 most predominant biological processes
top_20_processes <- process_counts[1:20, ]

# Create a color palette for the bars
bar_colors <- rainbow(nrow(top_20_processes))

# Create a staggered vertical bar plot with different colors for each bar
barplot(top_20_processes$Count, names.arg = rep("", nrow(top_20_processes)), col = bar_colors,
        ylim = c(0, max(top_20_processes$Count) * 1.25),
        main = "Occurrences of the 20 Most Predominant Biological Processes", xlab = "Biological Process", ylab = "Count")


# Create a separate plot for the legend
png("../output/02-Peve-reference-annotation/GOlegend.png", width = 800, height = 600)
par(mar = c(0, 0, 0, 0))
plot.new()
legend("center", legend = top_20_processes$Biological_Process, fill = bar_colors, cex = 1, title = "Biological Processes")
dev.off()
```

```{r transcriptome-go-legend, eval=TRUE, fig.width = 100 ,fig.height = 100}
knitr::include_graphics("../output/02-Peve-reference-annotation/GOlegend.png")
```

```{r transcriptome-remove-legend-file, engine='bash', eval=TRUE}
rm ../output/02-Peve-reference-annotation/GOlegend.png
```