Aidan Coyle,

Roberts Lab, UW-SAFS

Originally written 2021-02-16

This script takes a file of genes that are differentially expressed between two groups (obtained from DESeq2) and cross-references it with the transcriptome used to create your kallisto index. It then extracts the TPM (transcripts per million) counts from the kallisto libraries created earlier in the pipeline.

This produces a table containing the following:

First, loading the required libraries

library(tidyverse)

Then, we edit our variables. If running this script on other samples, you only need to edit the following section.

The following inputs are required

DEG_list: the filepath to your table of differentially-expressed genes blast_data: the filepath to the transcriptome. Should be the same transcriptome used to create the kallisto index outpath: the file you want to write the results to kallisto_path: the path to the folder containing all your kallisto libraries libraries: specify the libraries you want to add to your final matrix of TPM. Using the notation in Graceโ€™s hematodinium/bairdi libraries, the following notation applies (utilizes wildcards):

# Import DEG list outputted from DESeq2
DEG_list <- read.table("../graphs/DESeq2_output/cbaihemat_transcriptomev2.0/amb2_vs_elev2_indiv/DEGlist_wcols.txt",
                       header = TRUE,
                       sep = "\t")

blast_data <- read.delim("../data/cbai_hemat_diamond_blastx_table_transcriptome_v2.0.txt", 
                         header = FALSE,
                         sep = "\t")

# Specify file you want to write results to
outpath <- "../output/TPM_counts/cbaihemat_transcriptomev2.0/amb2_vs_elev2_DEG_TPMs.txt"

kallisto_path <- "../output/kallisto_libraries/cbaihemat_transcriptomev2.0/"

libraries <- "???"

We then cross-reference the DEG table with our BLAST data to create a two-column table of transcript IDs and gene IDs

# Transcript IDs are rownames - move them into first column
DEG_list <- rownames_to_column(DEG_list,
                               "Transcript_ID")

# Remove all columns that aren't transcript ID
DEG_list <- DEG_list[,1, drop = FALSE]

# Columns have no names - add names for first two columns
colnames(blast_data)[1:2] <- c("Transcript_ID", "Gene_ID")

# Turn the first two columns of BLAST data into a Transcript ID/Gene ID key
blast_data <- blast_data %>%
  select(Transcript_ID, Gene_ID)

# Add Gene ID column to transcript data, using Transcript ID column to match
DEG_list <- left_join(DEG_list, blast_data, by = "Transcript_ID")

# Select only DEGs with transcript IDs that match to genes
DEG_list <- DEG_list[!is.na(DEG_list$Gene_ID),]

head(DEG_list)
##                Transcript_ID               Gene_ID
## 1     TRINITY_DN978_c2_g1_i3 sp|O75179|ANR17_HUMAN
## 8   TRINITY_DN12263_c0_g1_i3 sp|Q66KH2|ERGI3_XENLA
## 11  TRINITY_DN12704_c0_g1_i1 sp|Q4V869|ACBD6_XENLA
## 17 TRINITY_DN26991_c0_g1_i17 sp|Q51732|DEGLY_PYRFU
## 20   TRINITY_DN400_c0_g1_i21 sp|Q96I24|FUBP3_HUMAN
## 23  TRINITY_DN6784_c0_g1_i10  sp|Q9CA23|UFM1_ARATH

We then extract the filenames for all kallisto files we want to match, and add a TPM column to our table for each kallisto file that we want to match

kallisto_filepath <- paste0(kallisto_path, "id", libraries, "/abundance.tsv")
# List all kallisto indices
kallisto_files <- Sys.glob(kallisto_filepath)



for (i in 1:length(kallisto_files)) {
  # Extract the ID number from the kallisto file
  idnum <- str_extract(kallisto_files[i], "id[0-9]+")
  # Read in the kallisto file
  kallisto_output <- read.delim(file = kallisto_files[i], 
                                header = TRUE,
                                sep = "\t")
  # Select only transcript ID and TPM (transcripts per million) columns
  kallisto_output <- kallisto_output %>%
    select(target_id, tpm)
  
  # Rename kallisto column names
  colnames(kallisto_output)[1:2] <- c("Transcript_ID", 
                                      paste0(idnum, "_TPM"))
  # Add TPM value to table of DEGs
  DEG_list <- left_join(DEG_list, kallisto_output, by = "Transcript_ID")
}

head(DEG_list)
##               Transcript_ID               Gene_ID id072_TPM id118_TPM id127_TPM
## 1    TRINITY_DN978_c2_g1_i3 sp|O75179|ANR17_HUMAN   0.00000 1.2028900  0.000000
## 2  TRINITY_DN12263_c0_g1_i3 sp|Q66KH2|ERGI3_XENLA   0.00000 0.3648890  0.000000
## 3  TRINITY_DN12704_c0_g1_i1 sp|Q4V869|ACBD6_XENLA   0.00000 0.0859281  0.000000
## 4 TRINITY_DN26991_c0_g1_i17 sp|Q51732|DEGLY_PYRFU   0.00000 0.6172780  0.000000
## 5   TRINITY_DN400_c0_g1_i21 sp|Q96I24|FUBP3_HUMAN   0.26481 0.6966450  0.936814
## 6  TRINITY_DN6784_c0_g1_i10  sp|Q9CA23|UFM1_ARATH   0.00000 0.8113120  0.000000
##   id132_TPM id151_TPM id173_TPM id178_TPM id254_TPM id272_TPM id280_TPM
## 1  0.850158         0         0  0.000000         0         0         0
## 2  0.000000         0         0  0.000000         0         0         0
## 3  0.853606         0         0  0.000000         0         0         0
## 4  1.571450         0         0  0.000000         0         0         0
## 5  0.000000         0         0  0.205008         0         0         0
## 6  0.640382         0         0  0.000000         0         0         0
##   id294_TPM id334_TPM id349_TPM id359_TPM   id445_TPM id463_TPM   id481_TPM
## 1         0   1.26075         0         0 0.00000e+00         0 0.000748794
## 2         0   1.55104         0         0 0.00000e+00         0 0.000000000
## 3         0   1.25870         0         0 0.00000e+00         0 0.000000000
## 4         0   2.87156         0         0 0.00000e+00         0 0.000000000
## 5         0   0.68939         0         0 1.25723e-06         0 0.000000000
## 6         0   2.04936         0         0 0.00000e+00         0 0.040730100
##   id485_TPM
## 1  0.561579
## 2  3.695670
## 3  1.211160
## 4  0.000000
## 5  0.000000
## 6  0.000000
# Write results to table
write.table(DEG_list,
            file = outpath,
            quote = FALSE,
            row.names = FALSE)