---
title: "Pver Annotation"
author: "Steven Roberts"
date: "`r format(Sys.time(), '%d %B, %Y')`"
output:
  github_document:
    toc: true
    toc_depth: 3
    number_sections: true
    html_preview: true
  html_document:
    theme: readable
    highlight: zenburn
    toc: true
    toc_float: true
    number_sections: true
    code_folding: show
    code_download: true
editor_options:
  markdown:
    wrap: sentence
---

```{r setup, include=FALSE}
library(knitr)
library(tidyverse)
library(kableExtra)
library(DT)
library(Biostrings)
library(tm)
knitr::opts_chunk$set(
  echo = TRUE,         # Display code chunks
  eval = FALSE,        # Do NOT evaluate chunks when knitting; run them interactively
  warning = FALSE,     # Hide warnings
  message = FALSE,     # Hide messages
  fig.width = 6,       # Set plot width in inches
  fig.height = 4,      # Set plot height in inches
  fig.align = "center", # Align plots to the center
  comment = ""         # Prevents appending '##' to beginning of lines in code output
)
```

Pver

# Protein

```{bash}
# Download the compressed protein FASTA into ../data.
cd ../data
curl -O http://pver.reefgenomics.org/download/Pver_proteins_names_v1.0.faa.gz
```

```{bash}
# Decompress to a separate file, leaving the .gz in place.
gunzip -c ../data/Pver_proteins_names_v1.0.faa.gz > ../data/Pver_proteins_names_v1.0.faa
```

```{bash}
head ../data/Pver_proteins_names_v1.0.faa
```

```{bash}
ls ../data/blast_dbs
```

```{bash}
# Download Swiss-Prot, tag the file with the release label, and decompress it.
cd ../data/blast_dbs
curl -O https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz
mv uniprot_sprot.fasta.gz uniprot_sprot_r2024_04.fasta.gz
# Must match the name given by `mv` above; every later chunk uses the _r2024_04 files.
gunzip -k uniprot_sprot_r2024_04.fasta.gz
```

```{bash}
# NOTE(review): the download chunk above works in ../data/blast_dbs, while this
# chunk and the next read from ../../data/blast_dbs -- confirm which is intended.
head ../../data/blast_dbs/uniprot_sprot_r2024_04.fasta
echo "Number of Sequences"
grep -c ">" ../../data/blast_dbs/uniprot_sprot_r2024_04.fasta
```

```{bash}
# Build a protein BLAST database from the Swiss-Prot FASTA.
/home/shared/ncbi-blast-2.15.0+/bin/makeblastdb \
-in ../../data/blast_dbs/uniprot_sprot_r2024_04.fasta \
-dbtype prot \
-out ../../data/blast_dbs/uniprot_sprot_r2024_04
```

---

```{bash}
# Build a protein BLAST database from the Pver protein FASTA.
/home/shared/ncbi-blast-2.15.0+/bin/makeblastdb \
-in ../data/Pver_proteins_names_v1.0.faa \
-dbtype prot \
-out ../output/02-Pver-blast/Pver_proteins_names_v1.0
```

```{bash}
# blastp the Machinery.fasta queries against the Pver protein database;
# tabular output (outfmt 6), at most one target and one HSP per query.
fasta="../data/Machinery.fasta"

/home/shared/ncbi-blast-2.15.0+/bin/blastp \
-query "$fasta" \
-db ../output/02-Pver-blast/Pver_proteins_names_v1.0 \
-out ../output/02-Pver-blast/Mach-blastp-Pver_out.tab \
-evalue 1E-05 \
-num_threads 48 \
-max_target_seqs 1 \
-max_hsps 1 \
-outfmt 6
```

```{r, engine='bash'}
# NOTE(review): the blastp chunk above writes to
# ../output/02-Pver-blast/Mach-blastp-Pver_out.tab, but this and the following
# chunks read/write under ../output/20-Apul-gene-annotation/ -- confirm which
# BLAST output is meant to be annotated here.
wc -l ../output/20-Apul-gene-annotation/blastp_out.tab
```

```{r, engine='bash'}
# Turn every '|' into a tab so '|'-delimited IDs become separate columns.
tr '|' '\t' < ../output/20-Apul-gene-annotation/blastp_out.tab \
> ../output/20-Apul-gene-annotation/blastp_out_sep.tab

head -1 ../output/20-Apul-gene-annotation/blastp_out_sep.tab
```

![](http://gannet.fish.washington.edu/seashell/snaps/Monosnap__in_UniProtKB_search_571864__UniProt_2024-08-23_16-18-03.png)

# **Download Swiss-Prot Information**

```
sr320@raven:/home/shared/8TB_HDD_01/sr320/github/deep-dive/data$ curl -H "Accept: text/plain; format=tsv" "https://rest.uniprot.org/uniprotkb/stream?fields=accession%2Creviewed%2Cid%2Cprotein_name%2Cgene_names%2Corganism_name%2Clength%2Cgo_p%2Cgo_c%2Cgo%2Cgo_f%2Cgo_id&format=tsv&query=%28*%29+AND+%28reviewed%3Atrue%29" -o SwissProt-Annot-GO_082324.tsv
```

```{bash}
wc -l ../../data/SwissProt-Annot-GO_082324.tsv
```

# **Join blast with GO info**

```{r}
# Headerless, tab-separated BLAST hits: columns are auto-named V1, V2, ...
bltabl <- read.delim(
  "../output/20-Apul-gene-annotation/blastp_out_sep.tab",
  header = FALSE
)

# Swiss-Prot annotation table; its header row supplies the column names.
spgo <- read.delim("../../data/SwissProt-Annot-GO_082324.tsv", header = TRUE)
```

```{r}
# Keep every BLAST row and attach the Swiss-Prot annotation whose `Entry`
# matches column V3.
# NOTE(review): V3 = accession and V13 = e-value assumes subject IDs look like
# "sp|ACCESSION|NAME" (split into three columns by `tr`) and that query IDs
# contain no '|' -- confirm against the `head -1` output above.
annot_tab <- bltabl %>%
  left_join(spgo, by = c("V3" = "Entry")) %>%
  select(
    query = V1,
    blast_hit = V3,
    evalue = V13,
    ProteinNames = Protein.names,
    BiologicalProcess = Gene.Ontology..biological.process.,
    GeneOntologyIDs = Gene.Ontology.IDs
  )
```

```{r}
head(annot_tab)
```

```{r}
# Write the annotation table as an unquoted TSV without row names.
write.table(
  annot_tab,
  file = "../output/20-Apul-gene-annotation/Apul-protein-GO.tsv",
  sep = "\t",
  row.names = FALSE,
  quote = FALSE
)
```

```{bash}
head ../output/20-Apul-gene-annotation/Apul-protein-GO.tsv
```

# RNAs

https://gannet.fish.washington.edu/acropora/E5-deep-dive/Transcripts/Apul_GCF_013753865.1_rna.fna

```{bash}
# Fetch the transcript FASTA into ../data.
cd ../data
curl -O https://gannet.fish.washington.edu/acropora/E5-deep-dive/Transcripts/Apul_GCF_013753865.1_rna.fna
```

```{bash}
head ../data/Apul_GCF_013753865.1_rna.fna
```

```{bash}
# blastx the transcripts against the Swiss-Prot protein database;
# tabular output (outfmt 6), at most one target and one HSP per query.
fasta="../data/Apul_GCF_013753865.1_rna.fna"

/home/shared/ncbi-blast-2.15.0+/bin/blastx \
  -query "${fasta}" \
  -db ../../data/blast_dbs/uniprot_sprot_r2024_04 \
  -out ../output/20-Apul-gene-annotation/blastx_out.tab \
  -evalue 1E-05 \
  -num_threads 48 \
  -max_target_seqs 1 \
  -max_hsps 1 \
  -outfmt 6
```

```{bash}
# Number of lines in the blastx output.
wc -l ../output/20-Apul-gene-annotation/blastx_out.tab
```

```{bash}
# Replace each '|' with a tab so '|'-delimited IDs split into their own columns,
# then show the first converted row.
tr '|' '\t' \
  < ../output/20-Apul-gene-annotation/blastx_out.tab \
  > ../output/20-Apul-gene-annotation/blastx_out_sep.tab

head -n 1 ../output/20-Apul-gene-annotation/blastx_out_sep.tab
```

![](http://gannet.fish.washington.edu/seashell/snaps/Monosnap__in_UniProtKB_search_571864__UniProt_2024-08-23_16-18-03.png)

# **Download Swiss-Prot Information**

```
sr320@raven:/home/shared/8TB_HDD_01/sr320/github/deep-dive/data$ curl -H "Accept: text/plain; format=tsv" "https://rest.uniprot.org/uniprotkb/stream?fields=accession%2Creviewed%2Cid%2Cprotein_name%2Cgene_names%2Corganism_name%2Clength%2Cgo_p%2Cgo_c%2Cgo%2Cgo_f%2Cgo_id&format=tsv&query=%28*%29+AND+%28reviewed%3Atrue%29" -o SwissProt-Annot-GO_082324.tsv
```

```{bash}
wc -l ../../data/SwissProt-Annot-GO_082324.tsv
```

# **Join blast with GO info**

```{r}
# BLAST hits have no header row, so columns come in as V1, V2, ...
bltabl <- read.delim(
  "../output/20-Apul-gene-annotation/blastx_out_sep.tab",
  header = FALSE
)

# Swiss-Prot annotation table, with column names taken from its header row.
spgo <- read.delim(
  "../../data/SwissProt-Annot-GO_082324.tsv",
  header = TRUE
)
```

```{r}
# Left join: every BLAST row is kept and annotated where column V3 matches
# the Swiss-Prot `Entry`; then keep and rename the columns of interest.
annot_tab <- bltabl %>%
  left_join(spgo, by = c("V3" = "Entry")) %>%
  select(
    query = V1,
    blast_hit = V3,
    evalue = V13,
    ProteinNames = Protein.names,
    BiologicalProcess = Gene.Ontology..biological.process.,
    GeneOntologyIDs = Gene.Ontology.IDs
  )
```

```{r}
head(annot_tab)
```

```{r}
# Save the annotation table as an unquoted TSV without row names.
write.table(
  annot_tab,
  file = "../output/20-Apul-gene-annotation/Apul-rna-GO.tsv",
  quote = FALSE,
  sep = "\t",
  row.names = FALSE
)
```

```{bash}
head ../output/20-Apul-gene-annotation/Apul-rna-GO.tsv
```