---
title: "R Notebook"
author: "Megan Ewing"
date: "2024-11-22"
output:
  html_document: default
  word_document: default
---

```{r}
# Load installed packages.
library(dplyr)
library(tidyr)
library(readr) # provides read_delim() and write_csv() used below
```

### Read in DEG and BLAST data

```{r}
# DEG statistics table; sep = "" splits on any run of whitespace.
stats <- read.csv("../output/1020-STAR-DEGstats_ToC_8ind27r.tab", sep = "")
head(stats)

# Copy the row names into a regular LOC column so they can serve as a join key.
stats$LOC <- row.names(stats)
head(stats)

# BLAST output (tab-delimited); the steps below use its LOC, Evalue and
# Subject_ID columns.
blast <- read.delim("../output/1125-blastout.tab")
head(blast)
```

### Gene level

Select the hit with the lowest E-value for each gene to get gene-level BLAST
results (as opposed to isoform-level results).

```{r}
# Create the gene-level data frame: one row per LOC.
blast_gene <- blast %>%
  # Group by gene ID.
  group_by(LOC) %>%
  # Keep the row with the smallest Evalue in each gene; which.min() returns
  # the first such row when several rows tie.
  dplyr::slice(which.min(Evalue)) %>%
  # Drop the grouping so later steps work on a plain, ungrouped table.
  ungroup() %>%
  # Split 'Subject_ID' into 'na', 'SPID' and 'gene.Uni' on "|", keeping the
  # original column.
  separate(
    Subject_ID,
    sep = "\\|",
    into = c("na", "SPID", "gene.Uni"),
    remove = FALSE
  ) %>%
  # Remove the temporary 'na' column.
  dplyr::select(-na) %>%
  # Split 'gene.Uni' into 'gene.Uni' and 'species' on "_".
  separate(
    gene.Uni,
    sep = "_",
    into = c("gene.Uni", "species"),
    remove = TRUE
  )

head(blast_gene)
```

### Join DEG info to BLAST info by LOC

```{r}
# Left join keeps every row of stats; genes without a BLAST hit get NA in the
# BLAST-derived columns.
DEG_annot <- left_join(stats, blast_gene, by = "LOC")
head(DEG_annot)

row.names(DEG_annot) <- DEG_annot$LOC
head(DEG_annot)
```

### Getting UniProt info

First, extract the SwissProt IDs.

```{r}
# Genes with no BLAST hit carry NA in SPID after the left join, and the same
# SPID can appear for several genes; write each real ID once, one per line.
SPID <- unique(DEG_annot$SPID[!is.na(DEG_annot$SPID)])
head(SPID)

writeLines(SPID, "../output/1126-DEG_SPID_star.txt")
```

The SPIDs were mapped on the UniProt website; the result was saved as a TSV
file and unzipped. It is imported here.

```{r}
uniprot <- read_delim("../output/idmapping_2024_11_27.tsv", delim = "\t")
head(uniprot)

# Rename the 'From' column to 'SPID' so it matches the join key in DEG_annot.
uniprot <- dplyr::rename(uniprot, SPID = From)
head(uniprot)
```

### Join BLAST and UniProt info

```{r}
# NOTE(review): if the UniProt mapping returns more than one row for a single
# SPID, this join duplicates the matching DEG rows — confirm the mapping file
# has one row per SPID.
DEG_annot2 <- left_join(DEG_annot, uniprot, by = "SPID")
head(DEG_annot2)

write_csv(DEG_annot2, "../output/1126-DEG_annot_star.csv", col_names = TRUE)
```