--- title: "orthogroup comparison" author: "Jill Ashey" date: "`r Sys.Date()`" output: html_document --- This script examines the orthogroups shared between the 3 species of interest and compares the orthogroups to the blast results that S. Roberts did. ```{r setup, include=FALSE} knitr::opts_chunk$set(echo = TRUE) library(tidyverse) library(UpSetR) library(grid) library(futile.logger) ``` ## Load and clean data ```{r} ortho <- read.table(file = '../data/orthogroups/Orthogroups.tsv', sep = '\t', header = TRUE, na.strings=c(""," ","NA")) colnames(ortho) <- c("Orthogroup", "Apul", "Pmea", "Peve") length(unique(ortho$Orthogroup)) # 22783 unique orthogroups head(ortho) ``` The dataframe has multiple protein IDs for each orthogroup, depending on the species. I need to split and unnest the 3 species columns so that each row has one orthogroup ID and one protein ID. ```{r} # Apulchra apul_ortho <- ortho %>% select(c("Orthogroup", "Apul")) %>% mutate(Apul = strsplit(as.character(Apul), ",")) %>% unnest(Apul) # Pmeandrina pmea_ortho <- ortho %>% select(c("Orthogroup", "Pmea")) %>% mutate(Pmea = strsplit(as.character(Pmea), ",")) %>% unnest(Pmea) # Pevermanni peve_ortho <- ortho %>% select(c("Orthogroup", "Peve")) %>% mutate(Peve = strsplit(as.character(Peve), ",")) %>% unnest(Peve) # Rejoin dataframes full <- full_join(apul_ortho, pmea_ortho, by = "Orthogroup") full <- full_join(full, peve_ortho, by = "Orthogroup") ``` Now we have a dataframe with a column for orthogroup and columns for each species. ## Visualize data Using a venn diagram, make a plot of the shared orthogroups ```{r} # Create a list of sets, one for each species set_list <- list( Apul = unique(full$Orthogroup[!is.na(full$Apul)]), Pmea = unique(full$Orthogroup[!is.na(full$Pmea)]), Peve = unique(full$Orthogroup[!is.na(full$Peve)]) ) # Define colors for each species set_colors <- c("Apul" = "orange", "Pmea" = "blue", "Peve" = "coral3") # Create a Venn diagram venn.plot <- venn.diagram( x = set_list, category.names = c("Apul", "Pmea", "Peve"), filename = "../output/orthogroups/Orthogroups_Venn.png", output = TRUE, fill = set_colors, # Specify fill colors alpha = 0.5 # Adjust transparency (alpha) ) ``` ## Merge with blast data First, read in small RNA and lncRNA blast table and compute metrics ```{r} getwd() ### LncRNA and smRNA comparison # e-value of 1E-08 smol <- read.table("../../E-Peve/output/02-Peve-lncRNA-align/lncRNA_sRNA_blastn03.tab") colnames(smol) <- c("qseqid", "sseqid", "pident", "length", "mismatch", "gapopen", "qstart", "qend", "sstart", "send", "evalue", "bitscore") ## Column names # qseqid - query gene id (lncRNAs in this case) # sseqid - subject or target sequence id (smRNAs in this case) # pident - percentage of identical positions # length - alignment overlap/sequence overlap # mismatch - number of mismatches # gapopen - number of gap openings # qstart - start of alignment in query # qend - end of alignment in query # sstart - start of alignment in subject # send - end of alignment in subject # evalue - expect value # bitscore - bit score # Remove :: in gene names smol$qseqid <- gsub("::", "", smol$qseqid) # How many lncRNAs are there? How many smRNAs? length(unique(smol$qseqid)) # 2952 lncRNAs length(unique(smol$sseqid)) # 14306 smRNAs # All of the matches have a pident of >97% # Remove all columns exceot qseqid and sseqid smol <- smol %>% select(qseqid, sseqid) # Remove duplicate rows smol <- unique(smol) ``` First, read in protein and lncRNA blast table and compute metrics ```{r} ### LncRNA and protein comparison # e-value of 1E-40 prot <- read.table("../../E-Peve/output/02-Peve-lncRNA-align/lncRNA_proteome_blastx02.tab") colnames(prot) <- c("qseqid", "sseqid", "pident", "length", "mismatch", "gapopen", "qstart", "qend", "sstart", "send", "evalue", "bitscore") # Remove :: in gene names prot$qseqid <- gsub("::", "", prot$qseqid) # How many lncRNAs are there? How many proteins? length(unique(prot$qseqid)) # 699 lncRNAs length(unique(prot$sseqid)) # 1282 proteins # Look at mean, min, max and median of pident mean(prot$pident) min(prot$pident) max(prot$pident) median(prot$pident) # Remove all columns exceot qseqid and sseqid prot <- prot %>% select(qseqid, sseqid) # Remove duplicate rows prot <- unique(prot) ``` Merge the two blast datasets ```{r} # Merge datasets all <- merge(smol, prot, by = "qseqid") # using merge here, but could switch to full_join. Merge is more strict head(all) # Rename columns colnames(all) <- c("lncRNA", "smRNA", "Peve") # renameing protein column 'Peve' so that I can join this df with the orthogroup df # How many lncRNAs are there? How many smRNAs, proteins? length(unique(all$lncRNA)) # 382 lncRNAs length(unique(all$smRNA)) # 1890 smRNAs length(unique(all$Peve)) # 719 proteins ``` Merge the blast datasets and the orthogroup info ```{r} ortho_blast <- full_join(all, full, by = "Peve") head(ortho_blast) ``` Now we have a dataframe that includes orthogroups and the blast results for the Peve. This only includes lncRNAs, smRNAs and protein sequences that were shared. Eg it does not include the blast results for lncRNA and smRNA if there was no associated protein sequence. ## Merge with Pmea annotations I'm not sure if we have annotations for any of the coral species yet, but there are KEGG annotations for Pmea. I'm going to merge the ortho_blast dataframe with that dataset. Maybe it'll give us an idea of what functions the lncRNAs and smRNAs could be involved in. First, read in Pmea annotations ```{r} pmea_annot <- read.csv('../data/Pocillopora_meandrina_HIv1.genes.EggNog_results.csv', sep = '\t', header = TRUE, na.strings=c(""," ","NA")) # rename first column so it can be merged properly colnames(pmea_annot)[1] <- "Pmea" ``` Merge annotations with ortho_blast dataframe ```{r} ortho_blast_annot <- full_join(pmea_annot, ortho_blast, by = "Pmea") head(ortho_blast_annot) ``` Let's remove the excess columns so we can have a better look at it. For now, I am just going to keep the Description column, but we can always add the GO, KEGG, etc columns back in at a later date. ```{r} colnames(ortho_blast_annot) # Select only specific columns ortho_blast_annot <- ortho_blast_annot %>% select(Pmea, Description, lncRNA, smRNA, Peve, Orthogroup, Apul) %>% na.omit(lncRNA) # remove all rows that have NAs in the lncRNA column # Count the number of times specific descriptions appear in the dataset description_counts <- ortho_blast_annot %>% group_by(Description) %>% summarize(Count = n()) ``` Looking at the description_counts, it looks like the functions among all this shared data are: - protein serine/threonine/tyrosine kinase activity - copper ion import across plasma membrane - Belongs to the heat shock protein 70 family This is just the 3 that had the highest number of counts. Let's do a quick inventory of how many lncRNAs, proteins, smRNAs, etc that we have left after all that filtering and merging. ```{r} length(unique(ortho_blast_annot$Pmea)) # 70 Pmea protein sequences length(unique(ortho_blast_annot$Description)) # 68 descriptions of potential functions length(unique(ortho_blast_annot$lncRNA)) # 84 Peve lncRNA sequences length(unique(ortho_blast_annot$smRNA)) # 288 small RNA sequences length(unique(ortho_blast_annot$Peve)) # 70 Peve protein sequences length(unique(ortho_blast_annot$Orthogroup)) # 70 orthogroups length(unique(ortho_blast_annot$Apul)) # 119 Apul protein sequences ```