We need to generate a databse that associates gene IDs with Uniprot Acession numbers and GO terms, for use with our gene-level count matrices generated through Hisat2/featureCounts. We already have a transcriptome database (generated in 03-transcriptome-annotation), and the transcriptome has annotated headers that include gene IDs. We can use these headers to add gene IDs to our existing database.

First, load our existing transcriptome database, which associates transcript IDs with Uniprot Acession numbers and GO terms (generated in 03-transcriptome-annotation)

# Load our transcript ID -> UniprotID/GO term database
rna_id <- read.csv("../output/03-transcriptome-annotation/G_macrocephalus_rna_IDmapping_2024_04_17.tab", sep="\t")

Now let’s view the header format from our original transcriptome file

grep "^>" ../data/GCF_031168955.1_ASM3116895v1_rna.fna | head -5
## >XM_060035408.1 PREDICTED: Gadus macrocephalus putative helicase MOV-10 (LOC132473465), mRNA
## >XM_060035409.1 PREDICTED: Gadus macrocephalus uncharacterized LOC132445431 (LOC132445431), transcript variant X2, mRNA
## >XM_060035410.1 PREDICTED: Gadus macrocephalus uncharacterized LOC132445431 (LOC132445431), transcript variant X3, mRNA
## >XM_060035411.1 PREDICTED: Gadus macrocephalus uncharacterized LOC132445431 (LOC132445431), transcript variant X4, mRNA
## >XM_060035412.1 PREDICTED: Gadus macrocephalus ATP-sensitive inward rectifier potassium channel 12-like (LOC132475957), transcript variant X2, mRNA

We need to extract three pieces of information from these headers: the transcript ID (e.g., “XM_060035408.1”), the associated gene ID (e.g., “LOC132473465”), and the type (e.g., “mRNA”).

grep "^>" ../data/GCF_031168955.1_ASM3116895v1_rna.fna | \
while read -r line; do
    # Extract the sequence ID
    seq_id=$(echo "$line" | awk -F'>| ' '{print $2}')
    # Extract the gene ID
    gene_ID=$(echo "$line" | grep -oP '\([^()]*\)(?=,[^()]*$)' | grep -oP '(?<=\().*(?=\))')
    # Extract the sequence type
    seq_type=$(echo "$line" | awk -F'), ' '{print $NF}')
    # Print the desired tab-delimited output format
    echo -e "${seq_id}\t${gene_ID}\t${seq_type}"
done > ../output/03.2-genome-annotation/transcriptome_headers_formatted.txt
# View some formatted headers
head -5 ../output/03.2-genome-annotation/transcriptome_headers_formatted.txt

echo ""
echo ""

# Check that we have the same number of formatted header lines as original header lines
echo "# headers in original transcriptome file:"
grep "^>" ../data/GCF_031168955.1_ASM3116895v1_rna.fna | wc -l
echo""
echo "# headers in formatted file:"
wc -l < ../output/03.2-genome-annotation/transcriptome_headers_formatted.txt
## XM_060035408.1   LOC132473465    mRNA
## XM_060035409.1   LOC132445431    transcript variant X2, mRNA
## XM_060035410.1   LOC132445431    transcript variant X3, mRNA
## XM_060035411.1   LOC132445431    transcript variant X4, mRNA
## XM_060035412.1   LOC132475957    transcript variant X2, mRNA
## 
## 
## # headers in original transcriptome file:
## 49417
## 
## # headers in formatted file:
## 49417
headers <- read.csv("../output/03.2-genome-annotation/transcriptome_headers_formatted.txt", sep="\t", header=FALSE, col.names=c("transcript_ID", "gene_ID", "transcript_type"))
# Join
genes_IDmap <- left_join(rna_id, headers, by=c("V1"="transcript_ID"))

# View output (columns gene_ID and transcript_type)
genes_IDmap %>%
  select(gene_ID, transcript_type) %>%
  head()
##        gene_ID             transcript_type
## 1 LOC132473465                        mRNA
## 2 LOC132475957 transcript variant X2, mRNA
## 3 LOC132445432 transcript variant X1, mRNA
## 4 LOC132445432 transcript variant X2, mRNA
## 5 LOC132445432 transcript variant X3, mRNA
## 6 LOC132445433                        mRNA

The output indicates we have gene_ID duplicates due to multiple transcript variants in the transcriptome file. We only want one entry for each gene_ID, since this is the column which will be used to join with our gene-level counts matrix. We need to reduce this ID mapping to only unique gene_IDs

# Reduce to unique gene_IDs 
genes_IDmap_unique <- genes_IDmap %>%
  distinct(gene_ID, .keep_all=TRUE)

# Check
genes_IDmap_unique %>%
  select(gene_ID, transcript_type) %>%
  head()
##        gene_ID             transcript_type
## 1 LOC132473465                        mRNA
## 2 LOC132475957 transcript variant X2, mRNA
## 3 LOC132445432 transcript variant X1, mRNA
## 4 LOC132445433                        mRNA
## 5        rbm46                        mRNA
## 6 LOC132445437                        mRNA
# Save to file
write.table(genes_IDmap_unique, "../output/03.2-genome-annotation/G_macrocephalus_genes_IDmapping_2024_05_22.tab", sep="\t")
LS0tCnRpdGxlOiAiMDMuMi1nZW5vbWUtYW5ub3RhdGlvbiIKYXV0aG9yOiAiS2F0aGxlZW4gRHVya2luIgpkYXRlOiAiMjAyNC0wNS0yMSIgCmFsd2F5c19hbGxvd19odG1sOiB0cnVlCm91dHB1dDogCiAgYm9va2Rvd246Omh0bWxfZG9jdW1lbnQyOgogICAgdGhlbWU6IGNvc21vCiAgICB0b2M6IHRydWUKICAgIHRvY19mbG9hdDogdHJ1ZQogICAgbnVtYmVyX3NlY3Rpb25zOiB0cnVlCiAgICBjb2RlX2ZvbGRpbmc6IHNob3cKICAgIGNvZGVfZG93bmxvYWQ6IHRydWUKICBnaXRodWJfZG9jdW1lbnQ6CiAgICB0b2M6IHRydWUKICAgIHRvY19kZXB0aDogMwogICAgbnVtYmVyX3NlY3Rpb25zOiB0cnVlCiAgICBodG1sX3ByZXZpZXc6IHRydWUgCi0tLQoKYGBge3Igc2V0dXAsIGluY2x1ZGU9RkFMU0V9CmxpYnJhcnkoa25pdHIpCmxpYnJhcnkodGlkeXZlcnNlKQpsaWJyYXJ5KGthYmxlRXh0cmEpCmxpYnJhcnkoRFQpCmxpYnJhcnkoQmlvc3RyaW5ncykKbGlicmFyeSh0bSkKa25pdHI6Om9wdHNfY2h1bmskc2V0KAogIGVjaG8gPSBUUlVFLCAgICAgICAgICMgRGlzcGxheSBjb2RlIGNodW5rcwogIGV2YWwgPSBUUlVFLCAgICAgICAgICMgRXZhbHVhdGUgY29kZSBjaHVua3MKICB3YXJuaW5nID0gRkFMU0UsICAgICAjIEhpZGUgd2FybmluZ3MKICBtZXNzYWdlID0gRkFMU0UsICAgICAjIEhpZGUgbWVzc2FnZXMKICBmaWcud2lkdGggPSA2LCAgICAgICAjIFNldCBwbG90IHdpZHRoIGluIGluY2hlcwogIGZpZy5oZWlnaHQgPSA0LCAgICAgICMgU2V0IHBsb3QgaGVpZ2h0IGluIGluY2hlcwogIGZpZy5hbGlnbiA9ICJjZW50ZXIiICMgQWxpZ24gcGxvdHMgdG8gdGhlIGNlbnRlcgopCmBgYAoKV2UgbmVlZCB0byBnZW5lcmF0ZSBhIGRhdGFic2UgdGhhdCBhc3NvY2lhdGVzIGdlbmUgSURzIHdpdGggVW5pcHJvdCBBY2Vzc2lvbiBudW1iZXJzIGFuZCBHTyB0ZXJtcywgZm9yIHVzZSB3aXRoIG91ciBnZW5lLWxldmVsIGNvdW50IG1hdHJpY2VzIGdlbmVyYXRlZCB0aHJvdWdoIEhpc2F0Mi9mZWF0dXJlQ291bnRzLiBXZSBhbHJlYWR5IGhhdmUgYSB0cmFuc2NyaXB0b21lIGRhdGFiYXNlIChnZW5lcmF0ZWQgaW4gYDAzLXRyYW5zY3JpcHRvbWUtYW5ub3RhdGlvbmApLCBhbmQgdGhlIHRyYW5zY3JpcHRvbWUgaGFzIGFubm90YXRlZCBoZWFkZXJzIHRoYXQgaW5jbHVkZSBnZW5lIElEcy4gV2UgY2FuIHVzZSB0aGVzZSBoZWFkZXJzIHRvIGFkZCBnZW5lIElEcyB0byBvdXIgZXhpc3RpbmcgZGF0YWJhc2UuCgpGaXJzdCwgbG9hZCBvdXIgZXhpc3RpbmcgdHJhbnNjcmlwdG9tZSBkYXRhYmFzZSwgd2hpY2ggYXNzb2NpYXRlcyB0cmFuc2NyaXB0IElEcyB3aXRoIFVuaXByb3QgQWNlc3Npb24gbnVtYmVycyBhbmQgR08gdGVybXMgKGdlbmVyYXRlZCBpbiBgMDMtdHJhbnNjcmlwdG9tZS1hbm5vdGF0aW9uYCkKYGBge3IgbG9hZC10cmFuc2NyaXB0b21lLWRhdGFiYXNlfQojIExvYWQgb3VyIHRyYW5zY3JpcHQgSUQgLT4gVW5pcHJvdElEL0dPIHRlcm0gZGF0YWJhc2UKcm5hX2lkIDwtIHJlYWQuY3N2KCIuLi9vdXRwdXQvMDMtdHJhbnNjcmlwdG9tZS1hbm5vdGF0aW9uL0dfbWFjcm9jZXBoYWx1c19ybmFfSURtYXBwaW5nXzIwMjRfMDRfMTcudGFiIiwgc2VwPSJcdCIpCmBgYAoKTm93IGxldCdzIHZpZXcgdGhlIGhlYWRlciBmb3JtYXQgZnJvbSBvdXIgb3JpZ2luYWwgdHJhbnNjcmlwdG9tZSBmaWxlCmBgYHtyIHZpZXctdHJhbnNjcmlwdG9tZS1oZWFkZXJzLCBlbmdpbmU9J2Jhc2gnfQpncmVwICJePiIgLi4vZGF0YS9HQ0ZfMDMxMTY4OTU1LjFfQVNNMzExNjg5NXYxX3JuYS5mbmEgfCBoZWFkIC01CmBgYAoKV2UgbmVlZCB0byBleHRyYWN0IHRocmVlIHBpZWNlcyBvZiBpbmZvcm1hdGlvbiBmcm9tIHRoZXNlIGhlYWRlcnM6IHRoZSB0cmFuc2NyaXB0IElEIChlLmcuLCAiWE1fMDYwMDM1NDA4LjEiKSwgdGhlIGFzc29jaWF0ZWQgZ2VuZSBJRCAoZS5nLiwgIkxPQzEzMjQ3MzQ2NSIpLCBhbmQgdGhlIHR5cGUgKGUuZy4sICJtUk5BIikuCmBgYHtyIGZvcm1hdC1oZWFkZXJzLCBlbmdpbmU9J2Jhc2gnLCBldmFsPUZBTFNFfQpncmVwICJePiIgLi4vZGF0YS9HQ0ZfMDMxMTY4OTU1LjFfQVNNMzExNjg5NXYxX3JuYS5mbmEgfCBcCndoaWxlIHJlYWQgLXIgbGluZTsgZG8KICAgICMgRXh0cmFjdCB0aGUgc2VxdWVuY2UgSUQKICAgIHNlcV9pZD0kKGVjaG8gIiRsaW5lIiB8IGF3ayAtRic+fCAnICd7cHJpbnQgJDJ9JykKICAgICMgRXh0cmFjdCB0aGUgZ2VuZSBJRAogICAgZ2VuZV9JRD0kKGVjaG8gIiRsaW5lIiB8IGdyZXAgLW9QICdcKFteKCldKlwpKD89LFteKCldKiQpJyB8IGdyZXAgLW9QICcoPzw9XCgpLiooPz1cKSknKQogICAgIyBFeHRyYWN0IHRoZSBzZXF1ZW5jZSB0eXBlCiAgICBzZXFfdHlwZT0kKGVjaG8gIiRsaW5lIiB8IGF3ayAtRicpLCAnICd7cHJpbnQgJE5GfScpCiAgICAjIFByaW50IHRoZSBkZXNpcmVkIHRhYi1kZWxpbWl0ZWQgb3V0cHV0IGZvcm1hdAogICAgZWNobyAtZSAiJHtzZXFfaWR9XHQke2dlbmVfSUR9XHQke3NlcV90eXBlfSIKZG9uZSA+IC4uL291dHB1dC8wMy4yLWdlbm9tZS1hbm5vdGF0aW9uL3RyYW5zY3JpcHRvbWVfaGVhZGVyc19mb3JtYXR0ZWQudHh0CmBgYAoKYGBge3IgY2hlY2staGVhZGVycywgZW5naW5lPSdiYXNoJ30KIyBWaWV3IHNvbWUgZm9ybWF0dGVkIGhlYWRlcnMKaGVhZCAtNSAuLi9vdXRwdXQvMDMuMi1nZW5vbWUtYW5ub3RhdGlvbi90cmFuc2NyaXB0b21lX2hlYWRlcnNfZm9ybWF0dGVkLnR4dAoKZWNobyAiIgplY2hvICIiCgojIENoZWNrIHRoYXQgd2UgaGF2ZSB0aGUgc2FtZSBudW1iZXIgb2YgZm9ybWF0dGVkIGhlYWRlciBsaW5lcyBhcyBvcmlnaW5hbCBoZWFkZXIgbGluZXMKZWNobyAiIyBoZWFkZXJzIGluIG9yaWdpbmFsIHRyYW5zY3JpcHRvbWUgZmlsZToiCmdyZXAgIl4+IiAuLi9kYXRhL0dDRl8wMzExNjg5NTUuMV9BU00zMTE2ODk1djFfcm5hLmZuYSB8IHdjIC1sCmVjaG8iIgplY2hvICIjIGhlYWRlcnMgaW4gZm9ybWF0dGVkIGZpbGU6Igp3YyAtbCA8IC4uL291dHB1dC8wMy4yLWdlbm9tZS1hbm5vdGF0aW9uL3RyYW5zY3JpcHRvbWVfaGVhZGVyc19mb3JtYXR0ZWQudHh0CmBgYAoKYGBge3IgbG9hZC1mb3JtYXR0ZWQtaGVhZGVyLWRhdGF9CmhlYWRlcnMgPC0gcmVhZC5jc3YoIi4uL291dHB1dC8wMy4yLWdlbm9tZS1hbm5vdGF0aW9uL3RyYW5zY3JpcHRvbWVfaGVhZGVyc19mb3JtYXR0ZWQudHh0Iiwgc2VwPSJcdCIsIGhlYWRlcj1GQUxTRSwgY29sLm5hbWVzPWMoInRyYW5zY3JpcHRfSUQiLCAiZ2VuZV9JRCIsICJ0cmFuc2NyaXB0X3R5cGUiKSkKYGBgCgpgYGB7ciBqb2luLUlEbWFwcGluZy1hbmQtaGVhZGVyc30KIyBKb2luCmdlbmVzX0lEbWFwIDwtIGxlZnRfam9pbihybmFfaWQsIGhlYWRlcnMsIGJ5PWMoIlYxIj0idHJhbnNjcmlwdF9JRCIpKQoKIyBWaWV3IG91dHB1dCAoY29sdW1ucyBnZW5lX0lEIGFuZCB0cmFuc2NyaXB0X3R5cGUpCmdlbmVzX0lEbWFwICU+JQogIHNlbGVjdChnZW5lX0lELCB0cmFuc2NyaXB0X3R5cGUpICU+JQogIGhlYWQoKQpgYGAKCgpUaGUgb3V0cHV0IGluZGljYXRlcyB3ZSBoYXZlIGdlbmVfSUQgZHVwbGljYXRlcyBkdWUgdG8gbXVsdGlwbGUgdHJhbnNjcmlwdCB2YXJpYW50cyBpbiB0aGUgdHJhbnNjcmlwdG9tZSBmaWxlLiBXZSBvbmx5IHdhbnQgKm9uZSogZW50cnkgZm9yIGVhY2ggZ2VuZV9JRCwgc2luY2UgdGhpcyBpcyB0aGUgY29sdW1uIHdoaWNoIHdpbGwgYmUgdXNlZCB0byBqb2luIHdpdGggb3VyIGdlbmUtbGV2ZWwgY291bnRzIG1hdHJpeC4gV2UgbmVlZCB0byByZWR1Y2UgdGhpcyBJRCBtYXBwaW5nIHRvIG9ubHkgdW5pcXVlIGdlbmVfSURzCmBgYHtyIHJlZHVjZS10by11bmlxdWUtZ2VuZUlEc30KIyBSZWR1Y2UgdG8gdW5pcXVlIGdlbmVfSURzIApnZW5lc19JRG1hcF91bmlxdWUgPC0gZ2VuZXNfSURtYXAgJT4lCiAgZGlzdGluY3QoZ2VuZV9JRCwgLmtlZXBfYWxsPVRSVUUpCgojIENoZWNrCmdlbmVzX0lEbWFwX3VuaXF1ZSAlPiUKICBzZWxlY3QoZ2VuZV9JRCwgdHJhbnNjcmlwdF90eXBlKSAlPiUKICBoZWFkKCkKCiMgU2F2ZSB0byBmaWxlCndyaXRlLnRhYmxlKGdlbmVzX0lEbWFwX3VuaXF1ZSwgIi4uL291dHB1dC8wMy4yLWdlbm9tZS1hbm5vdGF0aW9uL0dfbWFjcm9jZXBoYWx1c19nZW5lc19JRG1hcHBpbmdfMjAyNF8wNV8yMi50YWIiLCBzZXA9Ilx0IikKYGBgCg==