We need to generate a databse that associates gene IDs with Uniprot Acession numbers and GO terms, for use with our gene-level count matrices generated through Hisat2/featureCounts. We already have a transcriptome database (generated in 03-transcriptome-annotation
), and the transcriptome has annotated headers that include gene IDs. We can use these headers to add gene IDs to our existing database.
First, load our existing transcriptome database, which associates transcript IDs with Uniprot Acession numbers and GO terms (generated in 03-transcriptome-annotation
)
# Load our transcript ID -> UniprotID/GO term database
rna_id <- read.csv("../output/03-transcriptome-annotation/G_macrocephalus_rna_IDmapping_2024_04_17.tab", sep="\t")
Now let’s view the header format from our original transcriptome file
grep "^>" ../data/GCF_031168955.1_ASM3116895v1_rna.fna | head -5
## >XM_060035408.1 PREDICTED: Gadus macrocephalus putative helicase MOV-10 (LOC132473465), mRNA
## >XM_060035409.1 PREDICTED: Gadus macrocephalus uncharacterized LOC132445431 (LOC132445431), transcript variant X2, mRNA
## >XM_060035410.1 PREDICTED: Gadus macrocephalus uncharacterized LOC132445431 (LOC132445431), transcript variant X3, mRNA
## >XM_060035411.1 PREDICTED: Gadus macrocephalus uncharacterized LOC132445431 (LOC132445431), transcript variant X4, mRNA
## >XM_060035412.1 PREDICTED: Gadus macrocephalus ATP-sensitive inward rectifier potassium channel 12-like (LOC132475957), transcript variant X2, mRNA
We need to extract three pieces of information from these headers: the transcript ID (e.g., “XM_060035408.1”), the associated gene ID (e.g., “LOC132473465”), and the type (e.g., “mRNA”).
grep "^>" ../data/GCF_031168955.1_ASM3116895v1_rna.fna | \
while read -r line; do
# Extract the sequence ID
seq_id=$(echo "$line" | awk -F'>| ' '{print $2}')
# Extract the gene ID
gene_ID=$(echo "$line" | grep -oP '\([^()]*\)(?=,[^()]*$)' | grep -oP '(?<=\().*(?=\))')
# Extract the sequence type
seq_type=$(echo "$line" | awk -F'), ' '{print $NF}')
# Print the desired tab-delimited output format
echo -e "${seq_id}\t${gene_ID}\t${seq_type}"
done > ../output/03.2-genome-annotation/transcriptome_headers_formatted.txt
# View some formatted headers
head -5 ../output/03.2-genome-annotation/transcriptome_headers_formatted.txt
echo ""
echo ""
# Check that we have the same number of formatted header lines as original header lines
echo "# headers in original transcriptome file:"
grep "^>" ../data/GCF_031168955.1_ASM3116895v1_rna.fna | wc -l
echo""
echo "# headers in formatted file:"
wc -l < ../output/03.2-genome-annotation/transcriptome_headers_formatted.txt
## XM_060035408.1 LOC132473465 mRNA
## XM_060035409.1 LOC132445431 transcript variant X2, mRNA
## XM_060035410.1 LOC132445431 transcript variant X3, mRNA
## XM_060035411.1 LOC132445431 transcript variant X4, mRNA
## XM_060035412.1 LOC132475957 transcript variant X2, mRNA
##
##
## # headers in original transcriptome file:
## 49417
##
## # headers in formatted file:
## 49417
headers <- read.csv("../output/03.2-genome-annotation/transcriptome_headers_formatted.txt", sep="\t", header=FALSE, col.names=c("transcript_ID", "gene_ID", "transcript_type"))
# Join
genes_IDmap <- left_join(rna_id, headers, by=c("V1"="transcript_ID"))
# View output (columns gene_ID and transcript_type)
genes_IDmap %>%
select(gene_ID, transcript_type) %>%
head()
## gene_ID transcript_type
## 1 LOC132473465 mRNA
## 2 LOC132475957 transcript variant X2, mRNA
## 3 LOC132445432 transcript variant X1, mRNA
## 4 LOC132445432 transcript variant X2, mRNA
## 5 LOC132445432 transcript variant X3, mRNA
## 6 LOC132445433 mRNA
The output indicates we have gene_ID duplicates due to multiple transcript variants in the transcriptome file. We only want one entry for each gene_ID, since this is the column which will be used to join with our gene-level counts matrix. We need to reduce this ID mapping to only unique gene_IDs
# Reduce to unique gene_IDs
genes_IDmap_unique <- genes_IDmap %>%
distinct(gene_ID, .keep_all=TRUE)
# Check
genes_IDmap_unique %>%
select(gene_ID, transcript_type) %>%
head()
## gene_ID transcript_type
## 1 LOC132473465 mRNA
## 2 LOC132475957 transcript variant X2, mRNA
## 3 LOC132445432 transcript variant X1, mRNA
## 4 LOC132445433 mRNA
## 5 rbm46 mRNA
## 6 LOC132445437 mRNA
# Save to file
write.table(genes_IDmap_unique, "../output/03.2-genome-annotation/G_macrocephalus_genes_IDmapping_2024_05_22.tab", sep="\t")
LS0tCnRpdGxlOiAiMDMuMi1nZW5vbWUtYW5ub3RhdGlvbiIKYXV0aG9yOiAiS2F0aGxlZW4gRHVya2luIgpkYXRlOiAiMjAyNC0wNS0yMSIgCmFsd2F5c19hbGxvd19odG1sOiB0cnVlCm91dHB1dDogCiAgYm9va2Rvd246Omh0bWxfZG9jdW1lbnQyOgogICAgdGhlbWU6IGNvc21vCiAgICB0b2M6IHRydWUKICAgIHRvY19mbG9hdDogdHJ1ZQogICAgbnVtYmVyX3NlY3Rpb25zOiB0cnVlCiAgICBjb2RlX2ZvbGRpbmc6IHNob3cKICAgIGNvZGVfZG93bmxvYWQ6IHRydWUKICBnaXRodWJfZG9jdW1lbnQ6CiAgICB0b2M6IHRydWUKICAgIHRvY19kZXB0aDogMwogICAgbnVtYmVyX3NlY3Rpb25zOiB0cnVlCiAgICBodG1sX3ByZXZpZXc6IHRydWUgCi0tLQoKYGBge3Igc2V0dXAsIGluY2x1ZGU9RkFMU0V9CmxpYnJhcnkoa25pdHIpCmxpYnJhcnkodGlkeXZlcnNlKQpsaWJyYXJ5KGthYmxlRXh0cmEpCmxpYnJhcnkoRFQpCmxpYnJhcnkoQmlvc3RyaW5ncykKbGlicmFyeSh0bSkKa25pdHI6Om9wdHNfY2h1bmskc2V0KAogIGVjaG8gPSBUUlVFLCAgICAgICAgICMgRGlzcGxheSBjb2RlIGNodW5rcwogIGV2YWwgPSBUUlVFLCAgICAgICAgICMgRXZhbHVhdGUgY29kZSBjaHVua3MKICB3YXJuaW5nID0gRkFMU0UsICAgICAjIEhpZGUgd2FybmluZ3MKICBtZXNzYWdlID0gRkFMU0UsICAgICAjIEhpZGUgbWVzc2FnZXMKICBmaWcud2lkdGggPSA2LCAgICAgICAjIFNldCBwbG90IHdpZHRoIGluIGluY2hlcwogIGZpZy5oZWlnaHQgPSA0LCAgICAgICMgU2V0IHBsb3QgaGVpZ2h0IGluIGluY2hlcwogIGZpZy5hbGlnbiA9ICJjZW50ZXIiICMgQWxpZ24gcGxvdHMgdG8gdGhlIGNlbnRlcgopCmBgYAoKV2UgbmVlZCB0byBnZW5lcmF0ZSBhIGRhdGFic2UgdGhhdCBhc3NvY2lhdGVzIGdlbmUgSURzIHdpdGggVW5pcHJvdCBBY2Vzc2lvbiBudW1iZXJzIGFuZCBHTyB0ZXJtcywgZm9yIHVzZSB3aXRoIG91ciBnZW5lLWxldmVsIGNvdW50IG1hdHJpY2VzIGdlbmVyYXRlZCB0aHJvdWdoIEhpc2F0Mi9mZWF0dXJlQ291bnRzLiBXZSBhbHJlYWR5IGhhdmUgYSB0cmFuc2NyaXB0b21lIGRhdGFiYXNlIChnZW5lcmF0ZWQgaW4gYDAzLXRyYW5zY3JpcHRvbWUtYW5ub3RhdGlvbmApLCBhbmQgdGhlIHRyYW5zY3JpcHRvbWUgaGFzIGFubm90YXRlZCBoZWFkZXJzIHRoYXQgaW5jbHVkZSBnZW5lIElEcy4gV2UgY2FuIHVzZSB0aGVzZSBoZWFkZXJzIHRvIGFkZCBnZW5lIElEcyB0byBvdXIgZXhpc3RpbmcgZGF0YWJhc2UuCgpGaXJzdCwgbG9hZCBvdXIgZXhpc3RpbmcgdHJhbnNjcmlwdG9tZSBkYXRhYmFzZSwgd2hpY2ggYXNzb2NpYXRlcyB0cmFuc2NyaXB0IElEcyB3aXRoIFVuaXByb3QgQWNlc3Npb24gbnVtYmVycyBhbmQgR08gdGVybXMgKGdlbmVyYXRlZCBpbiBgMDMtdHJhbnNjcmlwdG9tZS1hbm5vdGF0aW9uYCkKYGBge3IgbG9hZC10cmFuc2NyaXB0b21lLWRhdGFiYXNlfQojIExvYWQgb3VyIHRyYW5zY3JpcHQgSUQgLT4gVW5pcHJvdElEL0dPIHRlcm0gZGF0YWJhc2UKcm5hX2lkIDwtIHJlYWQuY3N2KCIuLi9vdXRwdXQvMDMtdHJhbnNjcmlwdG9tZS1hbm5vdGF0aW9uL0dfbWFjcm9jZXBoYWx1c19ybmFfSURtYXBwaW5nXzIwMjRfMDRfMTcudGFiIiwgc2VwPSJcdCIpCmBgYAoKTm93IGxldCdzIHZpZXcgdGhlIGhlYWRlciBmb3JtYXQgZnJvbSBvdXIgb3JpZ2luYWwgdHJhbnNjcmlwdG9tZSBmaWxlCmBgYHtyIHZpZXctdHJhbnNjcmlwdG9tZS1oZWFkZXJzLCBlbmdpbmU9J2Jhc2gnfQpncmVwICJePiIgLi4vZGF0YS9HQ0ZfMDMxMTY4OTU1LjFfQVNNMzExNjg5NXYxX3JuYS5mbmEgfCBoZWFkIC01CmBgYAoKV2UgbmVlZCB0byBleHRyYWN0IHRocmVlIHBpZWNlcyBvZiBpbmZvcm1hdGlvbiBmcm9tIHRoZXNlIGhlYWRlcnM6IHRoZSB0cmFuc2NyaXB0IElEIChlLmcuLCAiWE1fMDYwMDM1NDA4LjEiKSwgdGhlIGFzc29jaWF0ZWQgZ2VuZSBJRCAoZS5nLiwgIkxPQzEzMjQ3MzQ2NSIpLCBhbmQgdGhlIHR5cGUgKGUuZy4sICJtUk5BIikuCmBgYHtyIGZvcm1hdC1oZWFkZXJzLCBlbmdpbmU9J2Jhc2gnLCBldmFsPUZBTFNFfQpncmVwICJePiIgLi4vZGF0YS9HQ0ZfMDMxMTY4OTU1LjFfQVNNMzExNjg5NXYxX3JuYS5mbmEgfCBcCndoaWxlIHJlYWQgLXIgbGluZTsgZG8KICAgICMgRXh0cmFjdCB0aGUgc2VxdWVuY2UgSUQKICAgIHNlcV9pZD0kKGVjaG8gIiRsaW5lIiB8IGF3ayAtRic+fCAnICd7cHJpbnQgJDJ9JykKICAgICMgRXh0cmFjdCB0aGUgZ2VuZSBJRAogICAgZ2VuZV9JRD0kKGVjaG8gIiRsaW5lIiB8IGdyZXAgLW9QICdcKFteKCldKlwpKD89LFteKCldKiQpJyB8IGdyZXAgLW9QICcoPzw9XCgpLiooPz1cKSknKQogICAgIyBFeHRyYWN0IHRoZSBzZXF1ZW5jZSB0eXBlCiAgICBzZXFfdHlwZT0kKGVjaG8gIiRsaW5lIiB8IGF3ayAtRicpLCAnICd7cHJpbnQgJE5GfScpCiAgICAjIFByaW50IHRoZSBkZXNpcmVkIHRhYi1kZWxpbWl0ZWQgb3V0cHV0IGZvcm1hdAogICAgZWNobyAtZSAiJHtzZXFfaWR9XHQke2dlbmVfSUR9XHQke3NlcV90eXBlfSIKZG9uZSA+IC4uL291dHB1dC8wMy4yLWdlbm9tZS1hbm5vdGF0aW9uL3RyYW5zY3JpcHRvbWVfaGVhZGVyc19mb3JtYXR0ZWQudHh0CmBgYAoKYGBge3IgY2hlY2staGVhZGVycywgZW5naW5lPSdiYXNoJ30KIyBWaWV3IHNvbWUgZm9ybWF0dGVkIGhlYWRlcnMKaGVhZCAtNSAuLi9vdXRwdXQvMDMuMi1nZW5vbWUtYW5ub3RhdGlvbi90cmFuc2NyaXB0b21lX2hlYWRlcnNfZm9ybWF0dGVkLnR4dAoKZWNobyAiIgplY2hvICIiCgojIENoZWNrIHRoYXQgd2UgaGF2ZSB0aGUgc2FtZSBudW1iZXIgb2YgZm9ybWF0dGVkIGhlYWRlciBsaW5lcyBhcyBvcmlnaW5hbCBoZWFkZXIgbGluZXMKZWNobyAiIyBoZWFkZXJzIGluIG9yaWdpbmFsIHRyYW5zY3JpcHRvbWUgZmlsZToiCmdyZXAgIl4+IiAuLi9kYXRhL0dDRl8wMzExNjg5NTUuMV9BU00zMTE2ODk1djFfcm5hLmZuYSB8IHdjIC1sCmVjaG8iIgplY2hvICIjIGhlYWRlcnMgaW4gZm9ybWF0dGVkIGZpbGU6Igp3YyAtbCA8IC4uL291dHB1dC8wMy4yLWdlbm9tZS1hbm5vdGF0aW9uL3RyYW5zY3JpcHRvbWVfaGVhZGVyc19mb3JtYXR0ZWQudHh0CmBgYAoKYGBge3IgbG9hZC1mb3JtYXR0ZWQtaGVhZGVyLWRhdGF9CmhlYWRlcnMgPC0gcmVhZC5jc3YoIi4uL291dHB1dC8wMy4yLWdlbm9tZS1hbm5vdGF0aW9uL3RyYW5zY3JpcHRvbWVfaGVhZGVyc19mb3JtYXR0ZWQudHh0Iiwgc2VwPSJcdCIsIGhlYWRlcj1GQUxTRSwgY29sLm5hbWVzPWMoInRyYW5zY3JpcHRfSUQiLCAiZ2VuZV9JRCIsICJ0cmFuc2NyaXB0X3R5cGUiKSkKYGBgCgpgYGB7ciBqb2luLUlEbWFwcGluZy1hbmQtaGVhZGVyc30KIyBKb2luCmdlbmVzX0lEbWFwIDwtIGxlZnRfam9pbihybmFfaWQsIGhlYWRlcnMsIGJ5PWMoIlYxIj0idHJhbnNjcmlwdF9JRCIpKQoKIyBWaWV3IG91dHB1dCAoY29sdW1ucyBnZW5lX0lEIGFuZCB0cmFuc2NyaXB0X3R5cGUpCmdlbmVzX0lEbWFwICU+JQogIHNlbGVjdChnZW5lX0lELCB0cmFuc2NyaXB0X3R5cGUpICU+JQogIGhlYWQoKQpgYGAKCgpUaGUgb3V0cHV0IGluZGljYXRlcyB3ZSBoYXZlIGdlbmVfSUQgZHVwbGljYXRlcyBkdWUgdG8gbXVsdGlwbGUgdHJhbnNjcmlwdCB2YXJpYW50cyBpbiB0aGUgdHJhbnNjcmlwdG9tZSBmaWxlLiBXZSBvbmx5IHdhbnQgKm9uZSogZW50cnkgZm9yIGVhY2ggZ2VuZV9JRCwgc2luY2UgdGhpcyBpcyB0aGUgY29sdW1uIHdoaWNoIHdpbGwgYmUgdXNlZCB0byBqb2luIHdpdGggb3VyIGdlbmUtbGV2ZWwgY291bnRzIG1hdHJpeC4gV2UgbmVlZCB0byByZWR1Y2UgdGhpcyBJRCBtYXBwaW5nIHRvIG9ubHkgdW5pcXVlIGdlbmVfSURzCmBgYHtyIHJlZHVjZS10by11bmlxdWUtZ2VuZUlEc30KIyBSZWR1Y2UgdG8gdW5pcXVlIGdlbmVfSURzIApnZW5lc19JRG1hcF91bmlxdWUgPC0gZ2VuZXNfSURtYXAgJT4lCiAgZGlzdGluY3QoZ2VuZV9JRCwgLmtlZXBfYWxsPVRSVUUpCgojIENoZWNrCmdlbmVzX0lEbWFwX3VuaXF1ZSAlPiUKICBzZWxlY3QoZ2VuZV9JRCwgdHJhbnNjcmlwdF90eXBlKSAlPiUKICBoZWFkKCkKCiMgU2F2ZSB0byBmaWxlCndyaXRlLnRhYmxlKGdlbmVzX0lEbWFwX3VuaXF1ZSwgIi4uL291dHB1dC8wMy4yLWdlbm9tZS1hbm5vdGF0aW9uL0dfbWFjcm9jZXBoYWx1c19nZW5lc19JRG1hcHBpbmdfMjAyNF8wNV8yMi50YWIiLCBzZXA9Ilx0IikKYGBgCg==