Code to annotate our A. pulchra reference files (the A. millipora transcriptome and genome) with GO information
We’ll be using the A. millipora NCBI rna.fna file, stored here and accessible on the deep-dive
genomic resources page
curl https://gannet.fish.washington.edu/acropora/E5-deep-dive/Transcripts/Apul_GCF_013753865.1_rna.fna \
-k \
> ../../data/Apul_GCF_013753865.1_rna.fna
Let’s check the file
echo "First few lines:"
head -3 ../../data/Apul_GCF_013753865.1_rna.fna
echo ""
echo "How many sequences are there?"
grep -c ">" ../../data/Apul_GCF_013753865.1_rna.fna
## First few lines:
## >XM_029323402.2 PREDICTED: Acropora millepora lipase ZK262.3-like (LOC114977611), mRNA
## GAAAGACCCTGGGAACGAGGTTGCAGGTTTTCCTAATGTTAATCTCGGTAATTGAAAAGGTTGGACTTTTGGAAGCGAGA
## ATTCAACGAAAAATTCATAATAAAATTAAGTGGGGCGGATCGACCTTGATGATGTGGGGCGGAACGATTGTAATTCCGTC
##
## How many sequences are there?
## 50570
# Read FASTA file
fasta_file <- "../../data/Apul_GCF_013753865.1_rna.fna" # Replace with the name of your FASTA file
sequences <- readDNAStringSet(fasta_file)
# Calculate sequence lengths
sequence_lengths <- width(sequences)
# Create a data frame
sequence_lengths_df <- data.frame(Length = sequence_lengths)
# Plot histogram using ggplot2
ggplot(sequence_lengths_df, aes(x = Length)) +
geom_histogram(binwidth = 1, color = "black", fill = "blue", alpha = 0.75) +
labs(title = "Histogram of Sequence Lengths",
x = "Sequence Length",
y = "Frequency") +
theme_minimal()
summary(sequence_lengths_df)
## Length
## Min. : 66
## 1st Qu.: 1077
## Median : 1778
## Mean : 2218
## 3rd Qu.: 2811
## Max. :65009
# Calculate base composition
base_composition <- alphabetFrequency(sequences, baseOnly = TRUE)
# Convert to data frame and reshape for ggplot2
base_composition_df <- as.data.frame(base_composition)
base_composition_df$ID <- rownames(base_composition_df)
base_composition_melted <- reshape2::melt(base_composition_df, id.vars = "ID", variable.name = "Base", value.name = "Count")
# Plot base composition bar chart using ggplot2
ggplot(base_composition_melted, aes(x = Base, y = Count, fill = Base)) +
geom_bar(stat = "identity", position = "dodge", color = "black") +
labs(title = "Base Composition",
x = "Base",
y = "Count") +
theme_minimal() +
scale_fill_manual(values = c("A" = "green", "C" = "blue", "G" = "yellow", "T" = "red"))
# Count CG motifs in each sequence
count_cg_motifs <- function(sequence) {
cg_motif <- "CG"
return(length(gregexpr(cg_motif, sequence, fixed = TRUE)[[1]]))
}
cg_motifs_counts <- sapply(sequences, count_cg_motifs)
# Create a data frame
cg_motifs_counts_df <- data.frame(CG_Count = cg_motifs_counts)
# Plot CG motifs distribution using ggplot2
ggplot(cg_motifs_counts_df, aes(x = CG_Count)) +
geom_histogram(binwidth = 1, color = "black", fill = "blue", alpha = 0.75) +
labs(title = "Distribution of CG Motifs",
x = "Number of CG Motifs",
y = "Frequency") +
theme_minimal()
cd ../../data
curl -O https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz
mv uniprot_sprot.fasta.gz uniprot_sprot_r2023_04.fasta.gz
gunzip -k uniprot_sprot_r2023_04.fasta.gz
/home/shared/ncbi-blast-2.11.0+/bin/makeblastdb \
-in ../../data/uniprot_sprot_r2023_04.fasta \
-dbtype prot \
-out ../../blastdb/uniprot_sprot_r2023_04
/home/shared/ncbi-blast-2.11.0+/bin/blastx \
-query ../../data/Apul_GCF_013753865.1_rna.fna \
-db ../../blastdb/uniprot_sprot_r2023_04 \
-out ../output/02-Apul-reference-annotation/Apul_GCF_013753865.1_rna-uniprot_blastx.tab \
-evalue 1E-20 \
-num_threads 20 \
-max_target_seqs 1 \
-outfmt 6
echo "First few lines:"
head -2 ../output/02-Apul-reference-annotation/Apul_GCF_013753865.1_rna-uniprot_blastx.tab
echo "Number of lines in output:"
wc -l ../output/02-Apul-reference-annotation/Apul_GCF_013753865.1_rna-uniprot_blastx.tab
## First few lines:
## XM_029323402.2 sp|Q9XTR8|LIP1_CAEEL 31.321 265 157 5 578 1306 61 322 3.59e-25 111
## XM_029323410.2 sp|Q9NUQ6|SPS2L_HUMAN 32.447 376 201 8 280 1284 9 372 7.10e-39 157
## Number of lines in output:
## 31701 ../output/02-Apul-reference-annotation/Apul_GCF_013753865.1_rna-uniprot_blastx.tab
tr '|' '\t' < ../output/02-Apul-reference-annotation/Apul_GCF_013753865.1_rna-uniprot_blastx.tab \
> ../output/02-Apul-reference-annotation/Apul_GCF_013753865.1_rna-uniprot_blastx_sep.tab
head -1 ../output/02-Apul-reference-annotation/Apul_GCF_013753865.1_rna-uniprot_blastx_sep.tab
## XM_029323402.2 sp Q9XTR8 LIP1_CAEEL 31.321 265 157 5 578 1306 61 322 3.59e-25 111
bltabl <- read.csv("../output/02-Apul-reference-annotation/Apul_GCF_013753865.1_rna-uniprot_blastx_sep.tab", sep = '\t', header = FALSE)
spgo <- read.csv("https://gannet.fish.washington.edu/seashell/snaps/uniprot_table_r2023_01.tab", sep = '\t', header = TRUE)
datatable(head(bltabl), options = list(scrollX = TRUE, scrollY = "400px", scrollCollapse = TRUE, paging = FALSE))
datatable(head(spgo), options = list(scrollX = TRUE, scrollY = "400px", scrollCollapse = TRUE, paging = FALSE))
datatable(
left_join(bltabl, spgo, by = c("V3" = "Entry")) %>%
select(V1, V3, V13, Protein.names, Organism, Gene.Ontology..biological.process., Gene.Ontology.IDs)
# %>% mutate(V1 = str_replace_all(V1,pattern = "solid0078_20110412_FRAG_BC_WHITE_WHITE_F3_QV_SE_trimmed", replacement = "Ab"))
)