--- title: "01-repro-annot" author: "Steven Roberts" date: "`r format(Sys.time(), '%d %B, %Y')`" analyses: github_document: toc: true toc_depth: 3 number_sections: true html_preview: true html_document: theme: readable highlight: zenburn toc: true toc_float: true number_sections: true code_folding: show code_download: true editor_options: markdown: wrap: sentence --- ```{r setup, include=FALSE} library(knitr) library(tidyverse) library(kableExtra) library(DT) library(Biostrings) library(tm) library(pheatmap) library(DESeq2) knitr::opts_chunk$set( echo = TRUE, # Display code chunks eval = FALSE, # Evaluate code chunks warning = FALSE, # Hide warnings message = FALSE, # Hide messages fig.width = 6, # Set plot width in inches fig.height = 4, # Set plot height in inches fig.align = "center", # Align plots to the center comment = "" # Prevents appending '##' to beginning of lines in code analyses ) ``` Lets take proteins and characterize all those involved in reproduction ```{r, engine='bash', eval=TRUE} head ../data/PO2457_Ostrea_lurida.protein.fasta ``` ```{r, engine='bash', eval=TRUE} grep ">" -c ../data/PO2457_Ostrea_lurida.protein.fasta ``` # Make BlastDB ```{r, engine='bash'} cd ../data curl -O https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz mv uniprot_sprot.fasta.gz uniprot_sprot_r2024_03.fasta.gz gunzip -k uniprot_sprot_r2024_03.fasta.gz ``` ```{r, engine='bash'} mkdir ../blastdb /home/shared/ncbi-blast-2.11.0+/bin/makeblastdb \ -in ../data/uniprot_sprot_r2024_03.fasta \ -dbtype prot \ -out ../blastdb/uniprot_sprot_r2024_03 ``` # Blast ```{r, engine='bash'} /home/shared/ncbi-blast-2.11.0+/bin/blastp \ -query ../data/PO2457_Ostrea_lurida.protein.fasta \ -db ../blastdb/uniprot_sprot_r2024_03 \ -out ../output/01-repro-annot/Olur-uniprot_blastp.tab \ -evalue 1E-20 \ -num_threads 30 \ -max_target_seqs 1 \ -max_hsps 1 \ -outfmt 6 ``` ```{r, engine='bash', eval=TRUE} head ../output/01-repro-annot/Olur-uniprot_blastp.tab ``` ```{r, engine='bash', eval=TRUE} wc -l ../output/01-repro-annot/Olur-uniprot_blastp.tab ``` ```{r, engine='bash'} /home/shared/ncbi-blast-2.11.0+/bin/blastp \ -query ../data/PO2457_Ostrea_lurida.protein.fasta \ -db ../blastdb/uniprot_sprot_r2024_03 \ -out ../output/01-repro-annot/Olur-uniprot_blastp_e10.tab \ -evalue 1E-10 \ -num_threads 30 \ -max_target_seqs 1 \ -max_hsps 1 \ -outfmt 6 ``` ```{r, engine='bash', eval=TRUE} head ../output/01-repro-annot/Olur-uniprot_blastp_e10.tab ``` ```{r, engine='bash', eval=TRUE} wc -l ../output/01-repro-annot/Olur-uniprot_blastp_10.tab ``` ```{r, engine='bash', eval=TRUE} tr '|' '\t' < ../output/01-repro-annot/Olur-uniprot_blastp.tab \ > ../output/01-repro-annot/Olur-uniprot_blastp_sep.tab head -1 ../output/01-repro-annot/Olur-uniprot_blastp_sep.tab ``` # Download SP GO INFORMATION ```{bash} curl -H "Accept: text/plain; format=tsv" "https://rest.uniprot.org/uniprotkb/stream?fields=accession%2Creviewed%2Cid%2Cprotein_name%2Cgene_names%2Corganism_name%2Clength%2Cgo_p%2Cgo_c%2Cgo%2Cgo_f%2Cgo_id&format=tsv&query=%28*%29+AND+%28reviewed%3Atrue%29" -o ../data/SPannot-GO.tsv ``` bltabl <- read.csv("output/01-repro-annot/Olur-uniprot_blastp_sep.tab", sep = '\t', header = FALSE) spgo <- read.csv("data/uniprot_table_go_r2024_03.tab", sep = '\t', header = TRUE) ```{r read-data} bltabl <- read.csv("../output/01-repro-annot/Olur-uniprot_blastp_sep.tab", sep = '\t', header = FALSE) spgo <- read.csv("../data/uniprot_table_go_r2024_03.tab", sep = '\t', header = TRUE) ``` ```{r} datatable(head(bltabl), options = list(scrollX = TRUE, scrollY = "400px", scrollCollapse = TRUE, paging = FALSE)) ``` ```{r spgo-table} datatable(head(spgo), options = list(scrollX = TRUE, scrollY = "400px", scrollCollapse = TRUE, paging = FALSE)) ``` Annot <- left_join(bltabl, spgo, by = c("V3" = "Entry")) %>% select(V1, V3, V13, Protein.names, Organism, Gene.Ontology..biological.process., Gene.Ontology.IDs) ```{r see} datatable( left_join(bltabl, spgo, by = c("V3" = "Entry")) %>% select(V1, V3, V13, Protein.names, Organism, Gene.Ontology..biological.process., Gene.Ontology.IDs) %>% mutate(V1 = str_replace_all(V1, pattern = "solid0078_20110412_FRAG_BC_WHITE_WHITE_F3_QV_SE_trimmed", replacement = "Ab")) ) ``` ```{r} #library(dplyr) #library(AnnotationDbi) #library(GO.db) # Function to check if any term in a list is a descendant of 'GO:0019953 reproduction' check_reproduction_descendant <- function(go_terms) { reproduction <- "GO:0019953" terms <- unlist(strsplit(go_terms, "; ")) # Split the terms ancestor_list <- lapply(terms, function(term) { tryCatch({ ans <- select(GO.db, keys = term, columns = "ANCESTOR", keytype = "GOID") as.character(ans$ANCESTOR) }, error = function(e) { return(NULL) }) # Handle missing terms gracefully }) any(sapply(ancestor_list, function(x) reproduction %in% x)) } # Apply the function to each cell in the dataframe Annot2 <- Annot %>% mutate(Is_Reproduction = sapply(Gene.Ontology.IDs, check_reproduction_descendant)) # Output the updated dataframe print(Annot2) ``` GO:0019953 - sexual reproduction ```{r} # Function to check if any term in a list is a descendant of 'GO:0019953 reproduction' check_reproduction_descendant <- function(go_terms) { reproduction <- "GO:0019953" terms <- unlist(strsplit(go_terms, ";")) # Split the terms ancestor_list <- lapply(terms, function(term) { tryCatch({ ans <- select(GO.db, keys = term, columns = "ANCESTOR", keytype = "GOID") as.character(ans$ANCESTOR) }, error = function(e) { cat("Error with term:", term, "\nError Message:", e$message, "\n") return(NULL) # Handle missing terms gracefully }) }) result <- any(sapply(ancestor_list, function(x) reproduction %in% x)) cat("Terms:", terms, "Result:", result, "\n") # Debug output return(result) } # Apply the function to each cell in the dataframe Annot2 <- Annot %>% dplyr::mutate(Is_Reproduction = sapply(Gene.Ontology.IDs, check_reproduction_descendant)) # Output the updated dataframe print(Annot2) ``` 333 ```{r} library(dplyr) library(GO.db) # Function to check if a term is a descendant of 'GO:0000003 reproduction' check_reproduction_descendant <- function(go_term) { reproduction <- "GO:0000003" ancestors <- as.character(GOTERM[go_term]@ancestors) return(reproduction %in% ancestors) } ``` ```{r} # Function to check if any term in a list is a descendant of 'GO:0000003 reproduction' check_reproduction_descendant <- function(go_terms) { reproduction <- "GO:0000003" terms <- unlist(strsplit(go_terms, ";")) # Split the terms # Check each term and return TRUE if any term is a descendant any(sapply(terms, function(term) reproduction %in% as.character(GOTERM[[term]]@ancestors))) } # Apply the function to each cell in the dataframe Annot2 <- Annot %>% mutate(Is_Reproduction = sapply(Gene.Ontology.IDs, check_reproduction_descendant)) # Output the updated dataframe print(Annot2) ``` ```{r} # Apply the function to each GO term in the dataframe Annot2 <- Annot %>% mutate(Is_Reproduction = sapply(Gene.Ontology.IDs, check_reproduction_descendant)) # Output the updated dataframe print(Annot2) ``` # Apply the function to each GO term in the dataframe data <- data %>% mutate(Is_Reproduction = sapply(GO_Term, check_reproduction_descendant)) # Output the updated dataframe print(data) ``` 333 ```{r} marts <- listMarts() print(marts) ``` ```{r} ensembl <- useDataset("cintestinalis_gene_ensembl", mart = ensembl) ``` ```{r} ensembl <- useMart("uniprot", dataset = "uniprot") ``` ```{r} gene2go <- getBM( attributes = c("uniprotswissprot", "external_gene_name", "go_id", "namespace_1003"), filters = "uniprotswissprot", values = genes, mart = ensembl ) print(gene2go) ``` ```{r} gene2go <- getBM( attributes = c("uniprotswissprot", "external_gene_name", "go_id", "namespace_1003"), filters = "uniprotswissprot", values = genes, mart = ensembl ) ``` ```{r join} annot_tab <- left_join(bltabl, spgo, by = c("V3" = "Entry")) %>% select(V1, V3, V13, Protein.names, Organism, Gene.Ontology..biological.process., Gene.Ontology.IDs) %>% mutate(V1 = str_replace_all(V1, pattern = "solid0078_20110412_FRAG_BC_WHITE_WHITE_F3_QV_SE_trimmed", replacement = "Ab")) ``` if (!requireNamespace("BiocManager", quietly = TRUE)) install.packages("BiocManager") BiocManager::install(c("clusterProfiler", "GO.db", "org.Hs.eg.db", "dplyr", "GOSemSim")) library(clusterProfiler) library(GO.db) library(org.Hs.eg.db) library(dplyr) library(GOSemSim) ```{r} # Read dataset dataset <- read.csv("../output/blast_annot_go.tab", sep = '\t') # Replace with the path to your dataset # Select the column of interest column_name <- "Organism" # Replace with the name of the column of interest column_data <- dataset[[column_name]] # Count the occurrences of the strings in the column string_counts <- table(column_data) # Convert to a data frame, sort by count, and select the top 10 string_counts_df <- as.data.frame(string_counts) colnames(string_counts_df) <- c("String", "Count") string_counts_df <- string_counts_df[order(string_counts_df$Count, decreasing = TRUE), ] top_10_strings <- head(string_counts_df, n = 10)