--- title: "06 more go" output: html_document date: --- new GO terms to try: Glycolysis (GO:0006096), Tricarboxylic Acid Cycle (GO:0006099), Oxidative phosphorylation (GO:0006119) # Glycolysis (GO:0006096) ```{bash} curl -H "Accept: text/plain" "https://rest.uniprot.org/uniprotkb/stream?format=fasta&query=%28%28go%3A0006096%29%29+AND+%28reviewed%3Atrue%29" -o ../output/06-moreGO/SwissProt-GO:0006096.fa ``` ```{bash} head ../output/06-moreGO/SwissProt-GO:0006096.fa grep -c ">" ../output/06-moreGO/SwissProt-GO:0006096.fa ``` Lets Pver as query and SwissProt-GO:0006096.fa is the database. ```{bash} /home/shared/ncbi-blast-2.15.0+/bin/makeblastdb \ -in ../output/06-moreGO/SwissProt-GO:0006096.fa \ -dbtype prot \ -out ../output/06-moreGO/SwissProt-GO:0006096 ``` ```{bash} fasta="../data/Pver_proteins_names_v1.0.faa" /home/shared/ncbi-blast-2.15.0+/bin/blastp \ -query $fasta \ -db ../output/06-moreGO/SwissProt-GO:0006096 \ -out ../output/06-moreGO/Pver_blastp-GO:0006096_out.tab \ -evalue 1E-05 \ -num_threads 48 \ -max_target_seqs 1 \ -max_hsps 1 \ -outfmt 6 ``` ```{bash} head ../output/06-moreGO/Pver_blastp-GO:0006096_out.tab wc -l ../output/06-moreGO/Pver_blastp-GO:0006096_out.tab ``` --- Tricarboxylic Acid Cycle (GO:0006099) ```{bash} # Set the variable for the GO term GO_TERM="0006099" # Use the variable in the curl command curl -H "Accept: text/plain" "https://rest.uniprot.org/uniprotkb/stream?format=fasta&query=%28%28go%3A${GO_TERM}%29%29+AND+%28reviewed%3Atrue%29" -o "../output/06-moreGO/SwissProt-GO:${GO_TERM}.fa" head ../output/06-moreGO/SwissProt-GO:${GO_TERM}.fa grep -c ">" ../output/06-moreGO/SwissProt-GO:${GO_TERM}.fa /home/shared/ncbi-blast-2.15.0+/bin/makeblastdb \ -in ../output/06-moreGO/SwissProt-GO:${GO_TERM}.fa \ -dbtype prot \ -out ../output/06-moreGO/SwissProt-GO:${GO_TERM} fasta="../data/Pver_proteins_names_v1.0.faa" /home/shared/ncbi-blast-2.15.0+/bin/blastp \ -query $fasta \ -db ../output/06-moreGO/SwissProt-GO:${GO_TERM} \ -out ../output/06-moreGO/Pver_blastp-GO:${GO_TERM}_out.tab \ -evalue 1E-05 \ -num_threads 48 \ -max_target_seqs 1 \ -max_hsps 1 \ -outfmt 6 \ 2>/dev/null head ../output/06-moreGO/Pver_blastp-GO:${GO_TERM}_out.tab wc -l ../output/06-moreGO/Pver_blastp-GO:${GO_TERM}_out.tab ``` # Oxidative phosphorylation (GO:0006119) ```{bash} # Set the variable for the GO term GO_TERM="0006119" # Use the variable in the curl command curl -H "Accept: text/plain" "https://rest.uniprot.org/uniprotkb/stream?format=fasta&query=%28%28go%3A${GO_TERM}%29%29+AND+%28reviewed%3Atrue%29" -o "../output/06-moreGO/SwissProt-GO:${GO_TERM}.fa" head ../output/06-moreGO/SwissProt-GO:${GO_TERM}.fa grep -c ">" ../output/06-moreGO/SwissProt-GO:${GO_TERM}.fa /home/shared/ncbi-blast-2.15.0+/bin/makeblastdb \ -in ../output/06-moreGO/SwissProt-GO:${GO_TERM}.fa \ -dbtype prot \ -out ../output/06-moreGO/SwissProt-GO:${GO_TERM} fasta="../data/Pver_proteins_names_v1.0.faa" /home/shared/ncbi-blast-2.15.0+/bin/blastp \ -query $fasta \ -db ../output/06-moreGO/SwissProt-GO:${GO_TERM} \ -out ../output/06-moreGO/Pver_blastp-GO:${GO_TERM}_out.tab \ -evalue 1E-05 \ -num_threads 48 \ -max_target_seqs 1 \ -max_hsps 1 \ -outfmt 6 \ 2>/dev/null head ../output/06-moreGO/Pver_blastp-GO:${GO_TERM}_out.tab wc -l ../output/06-moreGO/Pver_blastp-GO:${GO_TERM}_out.tab ``` ## Vizualize in R using heatmaps ```{r} # Load necessary libraries library(readr) library(readxl) library(tidyverse) library(genefilter) #for pOverA filtering #install.packages("ComplexHeatmap") library(ComplexHeatmap) count_mat <- read_csv("../data/RNAseq/Poc_gene_count_matrix.csv") all_genes <- count_mat$gene_id #count_mat <- count_mat %>% select(-gene_id) #rownames(count_mat) <- all_genes # Clean up the column names colnames(count_mat) <- gsub("_R1.fastp-trim.20230215.fq.gz.sam.sorted.bam.merge.gtf", "", colnames(count_mat)) dim(count_mat) ``` ## pOverA filtering ```{r} ffun<-filterfun(pOverA(0.25,10)) #set up filtering parameters counts <- count_mat %>% select(-gene_id) count_mat_poa <- genefilter((counts), ffun) #apply filter sum(count_mat_poa) #count number of genes left count_mat_poa <- count_mat[count_mat_poa,] #keep only rows that passed filter count_mat <- count_mat_poa ``` The gene names in the count matrix and the blast dfs are not the same; this is due to naming errors made by the authors of the [Pverr genome paper](https://academic.oup.com/gbe/article/12/10/1911/5898631?login=false#supplementary-data). They made a supplementary file that has both naming iterations, so this will be used to make sure the gene names are the same in each df Read in file with gene name iterations ```{r} names <- read_excel("../data/RNAseq/FileS2_Pver_gene_annot_May28.xlsx", skip = 4) %>% select(Query, Gene) count_mat <- count_mat %>% full_join(names, by = c("gene_id" = "Gene")) ``` ## Glycolysis (GO:0006096) ```{r} go_term = "0006096" # Read the tab file for the specified GO term go_file_path <- paste0("../output/06-moreGO/Pver_blastp-GO:", go_term, "_out.tab") GO_data <- read_delim(go_file_path, delim = "\t", col_names = FALSE) colnames(GO_data) <- c("query_id", "subject_id", "percent_identity", "alignment_length", "mismatches", "gap_openings", "q_start", "q_end", "s_start", "s_end", "e_value", "bit_score") head(GO_data) dim(GO_data) count_GO <- GO_data %>% left_join(count_mat, by = c("query_id" = "Query")) %>% na.omit() # Check the filtered result head(count_GO) dim(count_GO) ``` ```{r} expression_data <- count_GO %>% select(-c(1:13)) # Adjust the column selection if necessary # Set row names to gene identifiers rownames(expression_data) <- count_GO$Gene # Convert expression data to a matrix expression_matrix <- as.matrix(expression_data) # Calculate Z-scores for the expression matrix z_score_matrix <- t(scale(t(expression_matrix))) # Replace NA/NaN with 0 z_score_matrix[is.nan(z_score_matrix)] = 0 ``` ### heatmap ```{r} # Create the heatmap using ComplexHeatmap Heatmap(z_score_matrix, na_col = "black", name = "Gene Count", # Name for the heatmap color bar row_title = paste("Genes in", go_term), # Title for the row side column_title = "Samples", # Title for the column side show_row_names = TRUE, # Show gene names show_column_names = TRUE, # Show sample names cluster_rows = TRUE, # Cluster rows cluster_columns = TRUE, # Cluster columns row_dend_reorder = TRUE, # Reorder dendrogram based on clustering column_dend_reorder = TRUE, column_names_gp = gpar(fontsize = 6), # Reorder dendrogram based on clustering heatmap_legend_param = list(title = "Gene Count")) # Create the heatmap using ComplexHeatmap Heatmap(z_score_matrix, na_col = "black", name = "Gene Count", # Name for the heatmap color bar row_title = paste("Genes in", go_term), # Title for the row side column_title = "Samples", # Title for the column side show_row_names = TRUE, # Show gene names show_column_names = TRUE, # Show sample names cluster_rows = TRUE, # Cluster rows cluster_columns = FALSE, # Cluster columns row_dend_reorder = TRUE, # Reorder dendrogram based on clustering column_dend_reorder = FALSE, column_names_gp = gpar(fontsize = 6), # Reorder dendrogram based on clustering heatmap_legend_param = list(title = "Gene Count")) ``` ## Tricarboxylic Acid Cycle (GO:0006099) ```{r} go_term = "0006099" # Read the tab file for the specified GO term go_file_path <- paste0("../output/06-moreGO/Pver_blastp-GO:", go_term, "_out.tab") GO_data <- read_delim(go_file_path, delim = "\t", col_names = FALSE) colnames(GO_data) <- c("query_id", "subject_id", "percent_identity", "alignment_length", "mismatches", "gap_openings", "q_start", "q_end", "s_start", "s_end", "e_value", "bit_score") head(GO_data) dim(GO_data) count_GO <- GO_data %>% left_join(count_mat, by = c("query_id" = "Query")) %>% na.omit() # Check the filtered result head(count_GO) dim(count_GO) ``` ```{r} expression_data <- count_GO %>% select(-c(1:13)) # Adjust the column selection if necessary # Set row names to gene identifiers rownames(expression_data) <- count_GO$Gene # Convert expression data to a matrix expression_matrix <- as.matrix(expression_data) # Calculate Z-scores for the expression matrix z_score_matrix <- t(scale(t(expression_matrix))) # Replace NA/NaN with 0 z_score_matrix[is.nan(z_score_matrix)] = 0 ``` ### heatmap ```{r} # Create the heatmap using ComplexHeatmap Heatmap(z_score_matrix, na_col = "black", name = "Gene Count", # Name for the heatmap color bar row_title = paste("Genes in", go_term), # Title for the row side column_title = "Samples", # Title for the column side show_row_names = TRUE, # Show gene names show_column_names = TRUE, # Show sample names cluster_rows = TRUE, # Cluster rows cluster_columns = TRUE, # Cluster columns row_dend_reorder = TRUE, # Reorder dendrogram based on clustering column_dend_reorder = TRUE, column_names_gp = gpar(fontsize = 6), # Reorder dendrogram based on clustering heatmap_legend_param = list(title = "Gene Count")) # Create the heatmap using ComplexHeatmap Heatmap(z_score_matrix, na_col = "black", name = "Gene Count", # Name for the heatmap color bar row_title = paste("Genes in", go_term), # Title for the row side column_title = "Samples", # Title for the column side show_row_names = TRUE, # Show gene names show_column_names = TRUE, # Show sample names cluster_rows = TRUE, # Cluster rows cluster_columns = FALSE, # Cluster columns row_dend_reorder = TRUE, # Reorder dendrogram based on clustering column_dend_reorder = FALSE, column_names_gp = gpar(fontsize = 6), # Reorder dendrogram based on clustering heatmap_legend_param = list(title = "Gene Count")) ``` ## Oxidative phosphorylation (GO:0006119) ```{r} go_term = "0006119" # Read the tab file for the specified GO term go_file_path <- paste0("../output/06-moreGO/Pver_blastp-GO:", go_term, "_out.tab") GO_data <- read_delim(go_file_path, delim = "\t", col_names = FALSE) colnames(GO_data) <- c("query_id", "subject_id", "percent_identity", "alignment_length", "mismatches", "gap_openings", "q_start", "q_end", "s_start", "s_end", "e_value", "bit_score") head(GO_data) dim(GO_data) count_GO <- GO_data %>% left_join(count_mat, by = c("query_id" = "Query")) %>% na.omit() # Check the filtered result head(count_GO) dim(count_GO) ``` ```{r} expression_data <- count_GO %>% select(-c(1:13)) # Adjust the column selection if necessary # Set row names to gene identifiers rownames(expression_data) <- count_GO$Gene # Convert expression data to a matrix expression_matrix <- as.matrix(expression_data) # Calculate Z-scores for the expression matrix z_score_matrix <- t(scale(t(expression_matrix))) # Replace NA/NaN with 0 z_score_matrix[is.nan(z_score_matrix)] = 0 ``` ### heatmap ```{r} # Create the heatmap using ComplexHeatmap Heatmap(z_score_matrix, na_col = "black", name = "Gene Count", # Name for the heatmap color bar row_title = paste("Genes in", go_term), # Title for the row side column_title = "Samples", # Title for the column side show_row_names = TRUE, # Show gene names show_column_names = TRUE, # Show sample names cluster_rows = TRUE, # Cluster rows cluster_columns = TRUE, # Cluster columns row_dend_reorder = TRUE, # Reorder dendrogram based on clustering column_dend_reorder = TRUE, column_names_gp = gpar(fontsize = 6), # Reorder dendrogram based on clustering heatmap_legend_param = list(title = "Gene Count")) # Create the heatmap using ComplexHeatmap Heatmap(z_score_matrix, na_col = "black", name = "Gene Count", # Name for the heatmap color bar row_title = paste("Genes in", go_term), # Title for the row side column_title = "Samples", # Title for the column side show_row_names = TRUE, # Show gene names show_column_names = TRUE, # Show sample names cluster_rows = TRUE, # Cluster rows cluster_columns = FALSE, # Cluster columns row_dend_reorder = TRUE, # Reorder dendrogram based on clustering column_dend_reorder = FALSE, column_names_gp = gpar(fontsize = 6), # Reorder dendrogram based on clustering heatmap_legend_param = list(title = "Gene Count")) ``` ![](http://gannet.fish.washington.edu/seashell/snaps/Monosnap_QuickGOTerm_GO0040029_2024-10-03_16-29-35.png) # Grabbing Epigenetic Regulators of Gene Expression GO:0040029 epigenetic regulation of gene expression ```{bash} # Set the variable for the GO term GO_TERM="0040029" # Use the variable in the curl command curl -H "Accept: text/plain" "https://rest.uniprot.org/uniprotkb/stream?format=fasta&query=%28%28go%3A${GO_TERM}%29%29+AND+%28reviewed%3Atrue%29" -o "../output/06-moreGO/SwissProt-GO:${GO_TERM}.fa" head ../output/06-moreGO/SwissProt-GO:${GO_TERM}.fa grep -c ">" ../output/06-moreGO/SwissProt-GO:${GO_TERM}.fa /home/shared/ncbi-blast-2.15.0+/bin/makeblastdb \ -in ../output/06-moreGO/SwissProt-GO:${GO_TERM}.fa \ -dbtype prot \ -out ../output/06-moreGO/SwissProt-GO:${GO_TERM} fasta="../data/Pver_proteins_names_v1.0.faa" /home/shared/ncbi-blast-2.15.0+/bin/blastp \ -query $fasta \ -db ../output/06-moreGO/SwissProt-GO:${GO_TERM} \ -out ../output/06-moreGO/Pver_blastp-GO:${GO_TERM}_out.tab \ -evalue 1E-05 \ -num_threads 48 \ -max_target_seqs 1 \ -max_hsps 1 \ -outfmt 6 \ 2>/dev/null head ../output/06-moreGO/Pver_blastp-GO:${GO_TERM}_out.tab wc -l ../output/06-moreGO/Pver_blastp-GO:${GO_TERM}_out.tab ```