---
title: "04-repro-annot"
author: "Steven Roberts"
date: "`r format(Sys.time(), '%d %B, %Y')`"
output:
  github_document:
    toc: true
    toc_depth: 3
    number_sections: true
    html_preview: true
  html_document:
    theme: readable
    highlight: zenburn
    toc: true
    toc_float: true
    number_sections: true
    code_folding: show
    code_download: true
editor_options:
  markdown:
    wrap: sentence
---

```{r setup, include=FALSE}
library(knitr)
library(tidyverse)
library(kableExtra)
library(DT)
library(Biostrings)
library(tm)
library(pheatmap)
library(DESeq2)
knitr::opts_chunk$set(
  echo = TRUE,         # Display code chunks
  eval = FALSE,        # Do NOT evaluate chunks by default; opt in per chunk with eval=TRUE
  warning = FALSE,     # Hide warnings
  message = FALSE,     # Hide messages
  fig.width = 6,       # Set plot width in inches
  fig.height = 4,      # Set plot height in inches
  fig.align = "center", # Align plots to the center
  comment = ""         # Prevents appending '##' to beginning of lines in code output
)
```

Using the Swiss-Prot reproduction subset (reviewed UniProtKB entries annotated with GO:0022414).

# Make BlastDB

(Would only be needed once)

```{r, engine='bash'}
# Download the reviewed (Swiss-Prot) GO:0022414 sequences as FASTA.
# --fail makes curl exit non-zero on an HTTP error instead of saving the
# error page as if it were the FASTA file.
curl --fail -H "Accept: text/plain" "https://rest.uniprot.org/uniprotkb/stream?format=fasta&query=%28%28go%3A0022414%29%29+AND+%28reviewed%3Atrue%29" -o ../data/SwissProt-GO:0022414.fa
```

```{r, engine='bash', eval=TRUE}
# Quick sanity check of the download: first records and sequence count.
head ../data/SwissProt-GO:0022414.fa
echo "Number of Sequences"
grep -c ">" ../data/SwissProt-GO:0022414.fa
```

```{r, engine='bash'}
# Make sure the database directory exists before makeblastdb writes into it.
mkdir -p ../blastdb

# NOTE(review): this uses the 2.11.0+ install while the blastp step below uses
# 2.15.0+ -- confirm whether both should point at the same BLAST+ version.
/home/shared/ncbi-blast-2.11.0+/bin/makeblastdb \
-in ../data/SwissProt-GO:0022414.fa \
-dbtype prot \
-out ../blastdb/SwissProt-GO:0022414
```

# Set Query

## Set fasta as variable

```{r, engine='bash', eval=TRUE}
fasta="../data/PO2457_Ostrea_lurida.protein.fasta"

head "$fasta"
echo "Number of Sequences"
grep -c ">" "$fasta"
```

# Blast

```{r, engine='bash'}
# Each bash chunk runs in its own shell, so the query path is set again here.
fasta="../data/PO2457_Ostrea_lurida.protein.fasta"

# Make sure the output directory exists before blastp writes into it.
mkdir -p ../output/04-repro-annot

# One hit / one HSP per query, tabular output (-outfmt 6).
# NOTE(review): -max_target_seqs 1 keeps only a single subject per query;
# confirm that is the intended behaviour for this annotation.
/home/shared/ncbi-blast-2.15.0+/bin/blastp \
-query "$fasta" \
-db ../blastdb/SwissProt-GO:0022414 \
-out ../output/04-repro-annot/blastout.tab \
-evalue 1E-20 \
-num_threads 48 \
-max_target_seqs 1 \
-max_hsps 1 \
-outfmt 6
```

```{r, engine='bash', eval=TRUE}
head ../output/04-repro-annot/blastout.tab
```

```{r, engine='bash', eval=TRUE}
wc -l ../output/04-repro-annot/blastout.tab
```

```{r, engine='bash', eval=TRUE}
# Turn every '|' into a tab so the pipe-delimited subject ID is split into
# separate columns (the accession gets its own column for the join below).
tr '|' '\t' < ../output/04-repro-annot/blastout.tab \
> ../output/04-repro-annot/blastout_sep.tab

head -1 ../output/04-repro-annot/blastout_sep.tab
```

# Download Swiss-Prot Information

```{r, engine='bash'}
# Download the annotation table (TSV) for the same GO:0022414 reviewed entries.
# --fail makes curl exit non-zero on an HTTP error instead of saving the
# error page as if it were the TSV file.
curl --fail -H "Accept: text/plain; format=tsv" "https://rest.uniprot.org/uniprotkb/stream?fields=accession%2Creviewed%2Cid%2Cprotein_name%2Cgene_names%2Corganism_name%2Clength%2Cgo_p%2Cgo_c%2Cgo%2Cgo_f%2Cgo_id&format=tsv&query=%28%28go%3A0022414%29%29+AND+%28reviewed%3Atrue%29" -o ../data/SwissProt-GO:0022414.tsv
```

# Join blast with GO info

```{r read-data, eval=TRUE}
# BLAST table has no header row, so columns are auto-named V1, V2, ...
bltabl <- read.csv("../output/04-repro-annot/blastout_sep.tab", sep = "\t", header = FALSE)

# UniProt table has a header row; read.csv converts the header text into
# syntactic column names (e.g. "Protein.names"), which the select() below uses.
spgo <- read.csv("../data/SwissProt-GO:0022414.tsv", sep = "\t", header = TRUE)
```

```{r, eval=TRUE}
# Attach Swiss-Prot annotation to each BLAST hit, keyed on V3 == Entry.
# Assuming default -outfmt 6 columns and `sp|accession|entry_name` subject IDs,
# V1 is the query ID, V3 the accession and V13 the e-value -- TODO confirm
# against the head of blastout_sep.tab.
# dplyr:: is spelled out because packages attached after tidyverse in the
# setup chunk can mask dplyr verbs.
annot_tab <- dplyr::left_join(bltabl, spgo, by = c("V3" = "Entry")) %>%
  dplyr::select(
    V1, V3, V13, Protein.names, Organism,
    Gene.Ontology..biological.process., Gene.Ontology.IDs
  )
```

```{r, eval=TRUE}
head(annot_tab)
```