---
title: "11-Annotation"
author: "Steven Roberts"
date: "`r format(Sys.time(), '%d %B, %Y')`"
output: 
  github_document:
    toc: true
    toc_depth: 3
    number_sections: true
    html_preview: true
  html_document:
    theme: readable
    highlight: zenburn
    toc: true
    toc_float: true
    number_sections: true
    code_folding: show
    code_download: true
editor_options: 
  markdown: 
    wrap: sentence
---

```{r setup, include=FALSE}
library(knitr)
library(tidyverse)
library(kableExtra)
library(DT)
library(Biostrings)
library(tm)
library(pheatmap)
library(DESeq2)
knitr::opts_chunk$set(
  echo = TRUE,          # Display code chunks
  eval = FALSE,         # Do NOT evaluate chunks by default; chunks opt in with eval=TRUE
  warning = FALSE,      # Hide warnings
  message = FALSE,      # Hide messages
  fig.width = 6,        # Set plot width in inches
  fig.height = 4,       # Set plot height in inches
  fig.align = "center", # Align plots to the center
  comment = ""          # Prevents appending '##' to beginning of lines in code output
)
```

```{r, engine='bash', eval=TRUE}
# Preview the first lines of the DEG list written to ../output/10-hisat-deseq2.
head ../output/10-hisat-deseq2/DEGlist.tab
```

```{r, engine='bash', eval=TRUE}
# List the files available in the GCF_031168955.1 NCBI dataset directory.
ls /home/shared/8TB_HDD_03/sr320/github/project-cod-temperature/data/ncbi_dataset/data/GCF_031168955.1
```

```{r, engine='bash', eval=TRUE}
# Show the first lines of every file in the dataset directory.
head /home/shared/8TB_HDD_03/sr320/github/project-cod-temperature/data/ncbi_dataset/data/GCF_031168955.1/*
```

```{r, engine='bash', eval=TRUE}
# First 20 FASTA header lines of the RNA sequences.
grep ">" /home/shared/8TB_HDD_03/sr320/github/project-cod-temperature/data/ncbi_dataset/data/GCF_031168955.1/rna.fna \
| head -20
```

```{r, engine='bash', eval=TRUE}
# First 20 FASTA header lines of the protein sequences.
grep ">" /home/shared/8TB_HDD_03/sr320/github/project-cod-temperature/data/ncbi_dataset/data/GCF_031168955.1/protein.faa \
| head -20
```

```{r, engine='bash', eval=TRUE}
# First 20 FASTA header lines of the CDS sequences.
grep ">" /home/shared/8TB_HDD_03/sr320/github/project-cod-temperature/data/ncbi_dataset/data/GCF_031168955.1/cds_from_genomic.fna \
| head -20
```

Blast RNA fasta

```{r, engine='bash'}
# Download the current UniProt Swiss-Prot FASTA, rename it with a release tag,
# and decompress it while keeping the .gz (-k).
cd ../data
curl -O https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz
mv uniprot_sprot.fasta.gz uniprot_sprot_r2024_02.fasta.gz
gunzip -k uniprot_sprot_r2024_02.fasta.gz
```

```{r, engine='bash'}
# Build a protein BLAST database from the Swiss-Prot FASTA.
# -p keeps this chunk re-runnable when ../blastdb already exists.
# NOTE(review): makeblastdb is taken from ncbi-blast-2.11.0+ while the blastx
# chunks use ncbi-blast-2.15.0+ -- confirm the version mix is intentional.
mkdir -p ../blastdb
/home/shared/ncbi-blast-2.11.0+/bin/makeblastdb \
-in ../data/uniprot_sprot_r2024_02.fasta \
-dbtype prot \
-out ../blastdb/uniprot_sprot_r2024_02
```

```{r, engine='bash'}
# blastx the RNA FASTA against the Swiss-Prot database, tabular output (outfmt 6).
# The output directory is created first so blastx can write its result file.
# NOTE(review): -max_target_seqs 1 is reported not to guarantee the best-scoring
# hit -- confirm against the BLAST+ manual that this is acceptable here.
mkdir -p ../output/11-annotation
/home/shared/ncbi-blast-2.15.0+/bin/blastx \
-query ../data/ncbi_dataset/data/GCF_031168955.1/rna.fna \
-db ../blastdb/uniprot_sprot_r2024_02 \
-out ../output/11-annotation/PcodRNA_uniprot_blastx.tab \
-evalue 1E-20 \
-num_threads 30 \
-max_target_seqs 1 \
-outfmt 6
```

```{r, engine='bash', eval=TRUE}
head ../output/11-annotation/PcodRNA_uniprot_blastx.tab
```

```{r, eval=TRUE}
# Load the blastx table; it has no header row, so columns are named V1, V2, ...
blast <- read.csv(
  "../output/11-annotation/PcodRNA_uniprot_blastx.tab",
  sep = "\t",
  header = FALSE
)
```

```{r, eval=TRUE}
head(blast)
```

```{r, engine='bash', eval=TRUE}
# Show the first two RNA FASTA headers to check their layout.
grep ">" /home/shared/8TB_HDD_03/sr320/github/project-cod-temperature/data/ncbi_dataset/data/GCF_031168955.1/rna.fna \
| head -2
```

```{r, engine='bash'}
# Convert the RNA FASTA to one tab-delimited record per line:
#   - header lines lose the leading ">", have their first space replaced by a
#     tab, and get a trailing tab appended;
#   - sequence lines have spaces removed and are concatenated onto the record.
# Record count and total sequence length are reported on stderr via warn.
mkdir -p ../output/11-annotation
perl -e '$count=0; $len=0; while(<>) {s/\r?\n//; s/\t/ /g; if (s/^>//) { if ($. != 1) {print "\n"} s/ |$/\t/; $count++; $_ .= "\t";} else {s/ //g; $len += length($_)} print $_;} print "\n"; warn "\nConverted $count FASTA records in $. lines to tabular format\nTotal sequence length: $len\n\n";' \
../data/ncbi_dataset/data/GCF_031168955.1/rna.fna > ../output/11-annotation/GCF_031168955.1_rna.tab
```

```{r, engine='bash', eval=TRUE}
head ../output/11-annotation/GCF_031168955.1_rna.tab
```

```{r, eval=TRUE}
# Load the tabular FASTA; no header row, so columns are named V1, V2, ...
rnatab <- read.csv(
  "../output/11-annotation/GCF_031168955.1_rna.tab",
  sep = "\t",
  header = FALSE,
  row.names = NULL
)
```

```{r, eval=TRUE}
head(rnatab)
```

```{r, eval=TRUE}
library(stringr)
library(dplyr)
```

```{r, eval=TRUE}
# Pull the first parenthesised token in V2 that is immediately followed by a
# comma, then strip the parentheses from it.
# NOTE(review): presumably this token is the gene ID in the FASTA description --
# verify against the header preview above.
rnatabID <- rnatab %>%
  mutate(geneID = str_extract(V2, "\\(([^)]+)\\)(?=,)")) %>%
  mutate(geneID = str_replace_all(geneID, "\\(|\\)", ""))
```

```{r}
# Persist the table with the extracted geneID column (row names included).
write.table(
  rnatabID,
  "../output/11-annotation/rnatabID.tab",
  sep = "\t",
  row.names = TRUE
)
```

```{r, eval=TRUE}
head(rnatabID)
```

```{r, engine='bash'}
# blastx the CDS FASTA against the same Swiss-Prot database (outfmt 6).
# NOTE(review): -max_target_seqs 1 is reported not to guarantee the best-scoring
# hit -- confirm against the BLAST+ manual that this is acceptable here.
mkdir -p ../output/11-annotation
/home/shared/ncbi-blast-2.15.0+/bin/blastx \
-query ../data/ncbi_dataset/data/GCF_031168955.1/cds_from_genomic.fna \
-db ../blastdb/uniprot_sprot_r2024_02 \
-out ../output/11-annotation/Pcod_cds_genomic_uniprot_blastx.tab \
-evalue 1E-20 \
-num_threads 46 \
-max_target_seqs 1 \
-outfmt 6
```

```{r, engine='bash', eval=TRUE}
head ../output/11-annotation/Pcod_cds_genomic_uniprot_blastx.tab
```