--- title: "12-deg_annot_from_hisat" author: "Megan Ewing" date: "2024-05-13" output: html_document --- ```{r setup, include=FALSE} knitr::opts_chunk$set(echo = TRUE) ``` ```{r} # do once # install.packages("R.utils") library(tidyverse) library(R.utils) library(dplyr) ``` ## Annotating our top DEGs using Blast and Uniprot Gene Ontology (GO) ids ```{r} #read in DEG results DEG <- read.csv("../output/0513-DEG.tab", sep = " ") head(DEG) ``` ```{r} # make column with LOC names LOCnames <- rownames(DEG) DEG$LOC <- LOCnames head(DEG) ``` ```{r} # read in BLAST results blast <- read.delim("../output/rphil_blast_cds.tab", sep = "") head(blast) ``` ```{r} # make sure the column names are the same for the merge # not needed if you specify "by.x" and "by.y" in the merge function tho colnames(blast)[2] <- "LOC" head(blast) ``` ```{r} # join by gene name DEG_Blast <- left_join(x = DEG, y = blast, by = "LOC") head(DEG_Blast) write_delim(DEG_Blast, "../output/DEG_blast_cds_full.tab") ``` ```{r} # make a short file of just the DEG names and other desired data DEG_annot <- data.frame( LOC = DEG_Blast$LOC, geneName = DEG_Blast$saccver, baseMean = DEG_Blast$baseMean, log2FoldChange = DEG_Blast$log2FoldChange, pvalue = DEG_Blast$pvalue, padj = DEG_Blast$padj, protein = DEG_Blast$protein, cds = DEG_Blast$cds ) head(DEG_annot) ``` ```{r} # retrieve gene names for uniprot id lookup # this is to get just the numbers from the geneName column for uniprot # so sp|S8FGV1|LAC... becomes just S8FGV1 uniprot_id <- substr(DEG_annot$geneName, 4, 9) head(uniprot_id) # add this list of uniprot id names to the dataframe DEG_annot$uniprot_id <- uniprot_id head(DEG_annot) # write to table for uniprot import (or could just copy and paste but i imported a text file to uniprot just for my own santity to make sure everything was included) write.table(uniprot_id, "../output/uniprot_id_cds.txt", sep = "\t", row.names = FALSE, col.names = FALSE, quote = FALSE) uniprot_id_no_NA <- na.omit(uniprot_id) write.table(uniprot_id_no_NA, "../output/uniprot_id_cds_no_NA.txt", sep = "\t", row.names = FALSE, col.names = FALSE, quote = FALSE) ``` ```{r} # retrieved our unirpot id GO file from the web interface https://www.uniprot.org/ # importing and unzipping file here gunzip("../data/idmapping_2024_05_14.tsv.gz") GO_id <- read.csv("../data/idmapping_2024_05_14.tsv", sep = '\t', header = TRUE, row.names=NULL) head(GO_id) ``` ```{r} # join GO id info to our DEG_annot dataframe from earlier by the uniprot ids clam_GO_annotations <- merge(DEG_annot, GO_id, by.x = "uniprot_id", by.y = "Entry") head(clam_GO_annotations) ``` ```{r} # siic it looks all good so let's write to file write.csv(clam_GO_annotations, "../output/clam_GO_annotations_cds_0514.csv") ``` ```{r} # getting a full list of *all* of the accession numbers for gene enrichment # this is to get just the numbers from the geneName column for uniprot # so sp|S8FGV1|LAC... becomes just S8FGV1 uniprot_id_full <- substr(rphil_blast_cds$saccver, 4, 9) head(uniprot_id_full) # write to table for uniprot import (or could just copy and paste but i imported a text file to uniprot just for my own santity to make sure everything was included) write.table(uniprot_id_full, "../output/uniprot_id_cds_full.txt", sep = "\t", row.names = FALSE, col.names = FALSE, quote = FALSE) ```