--- title: "04-Carbon-transport" output: html_document date: "2024-10-02" --- ![](http://gannet.fish.washington.edu/seashell/snaps/Monosnap_QuickGOTerm_GO0034219_2024-10-02_11-46-38.png) ```{bash} curl -H "Accept: text/plain" "https://rest.uniprot.org/uniprotkb/stream?format=fasta&query=%28%28go%3A0034219%29%29+AND+%28reviewed%3Atrue%29" -o ../output/04-carbon-transport/SwissProt-GO:0034219.fa ``` ```{bash} head ../output/04-carbon-transport/SwissProt-GO:0034219.fa grep -c ">" ../output/04-carbon-transport/SwissProt-GO:0034219.fa ``` Lets Pver as query and SwissProt-GO:0034219 is the database. ```{bash} /home/shared/ncbi-blast-2.15.0+/bin/makeblastdb \ -in ../output/04-carbon-transport/SwissProt-GO:0034219.fa \ -dbtype prot \ -out ../output/04-carbon-transport/SwissProt-GO:0034219 ``` ```{bash} fasta="../data/Pver_proteins_names_v1.0.faa" /home/shared/ncbi-blast-2.15.0+/bin/blastp \ -query $fasta \ -db ../output/04-carbon-transport/SwissProt-GO:0034219 \ -out ../output/04-carbon-transport/Pver_blastp-GO:0034219_out.tab \ -evalue 1E-05 \ -num_threads 48 \ -max_target_seqs 1 \ -max_hsps 1 \ -outfmt 6 ``` ```{bash} head ../output/04-carbon-transport/Pver_blastp-GO:0034219_out.tab wc -l ../output/04-carbon-transport/Pver_blastp-GO:0034219_out.tab ``` ## Vizualize in R using heatmaps ```{r} # Load necessary libraries library(readr) library(readxl) library(tidyverse) #install.packages("ComplexHeatmap") library(ComplexHeatmap) count_mat <- read_csv("../data/RNAseq/Poc_gene_count_matrix.csv") all_genes <- count_mat$gene_id #count_mat <- count_mat %>% select(-gene_id) rownames(count_mat) <- all_genes # Clean up the column names colnames(count_mat) <- gsub("_R1.fastp-trim.20230215.fq.gz.sam.sorted.bam.merge.gtf", "", colnames(count_mat)) head(count_mat) dim(count_mat) ``` The gene names in the count matrix and the blast dfs are not the same; this is due to naming errors made by the authors of the [Pverr genome paper](https://academic.oup.com/gbe/article/12/10/1911/5898631?login=false#supplementary-data). They made a supplementary file that has both naming iterations, so this will be used to make sure the gene names are the same in each df Read in file with gene name iterations ```{r} names <- read_excel("../data/RNAseq/FileS2_Pver_gene_annot_May28.xlsx", skip = 4) %>% select(Query, Gene) count_mat <- count_mat %>% full_join(names, by = c("gene_id" = "Gene")) count_mat ``` ```{r} # Read the tab file and extract row names GO_0034219 <- read_delim("../output/04-carbon-transport/Pver_blastp-GO:0034219_out.tab", delim = "\t", col_names = FALSE) colnames(GO_0034219) <- c("query_id", "subject_id", "percent_identity", "alignment_length", "mismatches", "gap_openings", "q_start", "q_end", "s_start", "s_end", "e_value", "bit_score") head(GO_0034219) count_GO <- GO_0034219 %>% left_join(count_mat, by = c("query_id" = "Query" )) %>% na.omit() # Check the filtered result head(count_GO) dim(count_GO) colnames(count_GO) ``` ```{r} expression_data <- count_GO %>% select(-c(1:13)) # Adjust the column selection if necessary # Set row names to gene identifiers rownames(expression_data) <- count_GO$Gene # Convert expression data to a matrix expression_matrix <- as.matrix(expression_data) # Calculate Z-scores for the expression matrix z_score_matrix <- t(scale(t(expression_matrix))) # Replace NA/NaN with 0 z_score_matrix[is.nan(z_score_matrix)] = 0 # Create the heatmap using ComplexHeatmap Heatmap(z_score_matrix, na_col = "black", name = "Gene Count", # Name for the heatmap color bar row_title = "Genes in GO term", # Title for the row side column_title = "Samples", # Title for the column side show_row_names = TRUE, # Show gene names show_column_names = TRUE, # Show sample names cluster_rows = TRUE, # Cluster rows cluster_columns = TRUE, # Cluster columns row_dend_reorder = TRUE, # Reorder dendrogram based on clustering column_dend_reorder = TRUE, column_names_gp = gpar(fontsize = 6), # Reorder dendrogram based on clustering heatmap_legend_param = list(title = "Gene Count")) # Create the heatmap using ComplexHeatmap Heatmap(z_score_matrix, na_col = "black", name = "Gene Count", # Name for the heatmap color bar row_title = "Genes in GO term", # Title for the row side column_title = "Samples", # Title for the column side show_row_names = TRUE, # Show gene names show_column_names = TRUE, # Show sample names cluster_rows = TRUE, # Cluster rows cluster_columns = FALSE, # Cluster columns row_dend_reorder = TRUE, # Reorder dendrogram based on clustering column_dend_reorder = FALSE, column_names_gp = gpar(fontsize = 6), # Reorder dendrogram based on clustering heatmap_legend_param = list(title = "Gene Count")) ```