--- title: "12-deg_annot_from_hisat" author: "Megan Ewing" date: "2024-05-13" output: html_document --- ```{r setup, include=FALSE} knitr::opts_chunk$set(echo = TRUE) ``` ```{r} # do once # install.packages("R.utils") library(tidyverse) library(R.utils) library(dplyr) ``` ## Annotating our top DEGs using Blast and Uniprot Gene Ontology (GO) ids # Gene Level - No Pre Filter ```{r} #read in DEG results DEG <- read.csv("../output/0513-DEG.tab", sep = " ") head(DEG) ``` ```{r} # make column with LOC names LOCnames <- rownames(DEG) DEG$LOC <- LOCnames head(DEG) ``` ```{r} # read in BLAST results blast <- read.csv("~/clamgonads-macsamples/output/blast_out_gene_level_20240709.txt", sep="") head(blast) ``` ```{r} # make sure the column names are the same for the merge # not needed if you specify "by.x" and "by.y" in the merge function tho colnames(blast)[2] <- "LOC" head(blast) ``` ```{r} # join by gene name DEG_Blast <- left_join(x = DEG, y = blast, by = "LOC") head(DEG_Blast) write_delim(DEG_Blast, "../output/DEG_blast_cds_full_0709.tab") ``` ```{r} # make a short file of just the DEG names and other desired data DEG_annot <- data.frame( LOC = DEG_Blast$LOC, geneName = DEG_Blast$SPID, baseMean = DEG_Blast$baseMean, log2FoldChange = DEG_Blast$log2FoldChange, pvalue = DEG_Blast$pvalue, padj = DEG_Blast$padj, protein = DEG_Blast$protein, cds = DEG_Blast$cds ) head(DEG_annot) ``` ```{r} # retrieve gene names for uniprot id lookup # this is to get just the numbers from the geneName column for uniprot # so sp|S8FGV1|LAC... becomes just S8FGV1 uniprot_id <-DEG_annot$geneName[!is.na(DEG_annot$geneName)] head(uniprot_id) # write to table for uniprot import (or could just copy and paste but i imported a text file to uniprot just for my own santity to make sure everything was included) write.table(uniprot_id, "../output/uniprot_id_cds_genelevel_DEG_0709.txt", sep = "\t", row.names = FALSE, col.names = FALSE, quote = FALSE) # and now for all the uniprot ids at the gene level that blast had write.table(blast$SPID, "../output/uniprot_id_cds_genelevel_FULL_0709.txt", sep = "\t", row.names = FALSE, col.names = FALSE, quote = FALSE) ``` ```{r} # retrieved our unirpot id GO file from the web interface https://www.uniprot.org/ # importing and unzipping file here gunzip("../data/idmapping_2024_07_10.tsv.gz") GO_id <- read.csv("../data/idmapping_2024_07_10.tsv", sep = '\t', header = TRUE, row.names=NULL) head(GO_id) ``` ```{r} # join GO id info to our DEG_annot dataframe from earlier by the uniprot ids clam_GO_annotations <- merge(DEG_annot, GO_id, by.x = "geneName", by.y = "Entry") head(clam_GO_annotations) ``` ```{r} # siic it looks all good so let's write to file write.csv(clam_GO_annotations, "../output/clam_GO_annotations_cds_0709.csv") ``` ```{r} ## putting together a file that combines annotations, counts, and deseq results for all hits with p < 0.05 counts2 <- read_csv("../output/log_counts_all.csv") head(counts2) colnames(counts2)[1] <- "LOC" head(counts2) all_counts_annot <- left_join(x = counts2, y = clam_GO_annotations, by = "LOC") head(all_counts_annot) write.csv(all_counts_annot, "../output/counts_annot_all-0710.csv") ```