This notebook reformats the original P.evermanni GFF (Porites_evermanni_v1.annot.gff), which is not compliant with the GFF standard (GitHub page). The GFF is lacking the gene
feature, which may (or may not) be needed/useful for downstream processing. This notebook adds a gene
feature. Additionally, it is lacking the exon
feature. We’ve decided to insert exon
features which cover UTR
and CDS
features.
Finally, despite the naming convention, there aren’t any actual annotations in that GFF, beyond the feature designations (i.e. no gene ontology, no SwissProt IDs, gene names, etc.). This notebook does not address those shortcomings.
Unlike other scripts, this will output to E-Peve/data, instead of an output directory in ../output
.
Requires genometools (GitHub repo) to be installed and in the system $PATH
.
Requires AGAT to be installed via conda/mamba for conversion to GTF.
This allows usage of Bash variables across R Markdown chunks.
{
echo "#### Assign Variables ####"
echo ""
echo "# Data directories"
echo 'export repo_dir=~/gitrepos/urol-e5/timeseries_molecular'
echo 'export data_dir=${repo_dir}/E-Peve/data'
echo ""
echo "# Input files"
echo 'export original_gff="Porites_evermanni_v1.annot.gff"'
echo 'export intermediate_gff="intermediate.gff"'
echo 'export validated_gff="Porites_evermanni_validated.gff3"'
echo ""
echo "# Output files"
echo 'export gtf="Porites_evermanni_validated.gtf"'
echo ""
echo "# Programs"
echo 'export conda_path="/home/sam/programs/miniforge3-24.7.1-0/bin/mamba"'
echo 'export conda_env_name="agat_env"'
echo ""
echo "# Print formatting"
echo 'export line="--------------------------------------------------------"'
echo ""
} > .bashvars
cat .bashvars
#### Assign Variables ####
# Data directories
export repo_dir=~/gitrepos/urol-e5/timeseries_molecular
export data_dir=${repo_dir}/E-Peve/data
# Input files
export original_gff="Porites_evermanni_v1.annot.gff"
export intermediate_gff="intermediate.gff"
export validated_gff="Porites_evermanni_validated.gff3"
# Output files
export gtf="Porites_evermanni_validated.gtf"
# Programs
export conda_path="/home/sam/programs/miniforge3-24.7.1-0/bin/mamba"
export conda_env_name="agat_env"
# Print formatting
export line="--------------------------------------------------------"
source .bashvars
head "${data_dir}/${original_gff}"
Porites_evermani_scaffold_1 Gmove mRNA 3107 4488 543 - . ID=Peve_00000001;Name=Peve_00000001;start=0;stop=1;cds_size=543
Porites_evermani_scaffold_1 Gmove CDS 3107 3444 . - . Parent=Peve_00000001
Porites_evermani_scaffold_1 Gmove CDS 4284 4488 . - . Parent=Peve_00000001
Porites_evermani_scaffold_1 Gmove mRNA 424479 429034 2439.63 - . ID=Peve_00000002;Name=Peve_00000002;start=1;stop=1;cds_size=2019
Porites_evermani_scaffold_1 Gmove CDS 424479 425361 . - . Parent=Peve_00000002
Porites_evermani_scaffold_1 Gmove CDS 426181 426735 . - . Parent=Peve_00000002
Porites_evermani_scaffold_1 Gmove CDS 427013 427140 . - . Parent=Peve_00000002
Porites_evermani_scaffold_1 Gmove CDS 427665 427724 . - . Parent=Peve_00000002
Porites_evermani_scaffold_1 Gmove CDS 428642 429034 . - . Parent=Peve_00000002
Porites_evermani_scaffold_1 Gmove mRNA 429394 438909 1570.66 + . ID=Peve_00000003;Name=Peve_00000003;start=1;stop=1;cds_size=1458
source .bashvars
awk '{print $3}' "${data_dir}/${original_gff}" | sort --unique
CDS
mRNA
UTR
source .bashvars
awk '
BEGIN { OFS="\t"; mrna_count = 0; utr_count = 0; gene_count = 0; cds_count = 0; exon_count = 0 }
{
if ($3 == "mRNA") {
split($9, attributes, ";")
for (i in attributes) {
if (attributes[i] ~ /^ID=/) {
original_id = substr(attributes[i], 4)
gene_id = "ID=gene-" original_id
parent_id = "Parent=gene-" original_id
break
}
}
# Increment the global mRNA counter
mrna_count++
new_mrna_id = "ID=mrna-" sprintf("%05d", mrna_count)
# Store the mapping of original mRNA ID to new mRNA ID
mrna_map[original_id] = "mrna-" sprintf("%05d", mrna_count)
# Replace the old ID with the new mRNA ID
for (i in attributes) {
if (attributes[i] ~ /^ID=/) {
attributes[i] = new_mrna_id
break
}
}
$9 = attributes[1]
for (i = 2; i <= length(attributes); i++) {
$9 = $9 ";" attributes[i]
}
$9 = $9 ";" parent_id
print $1, $2, "gene", $4, $5, ".", $7, $8, gene_id
print $0 # Print the original mRNA feature
# Increment the gene counter and reset the CDS counter for each new gene
gene_count++
} else if ($3 == "UTR" || $3 == "CDS") {
split($9, attributes, ";")
id_set = 0
parent_set = 0
for (i in attributes) {
if (attributes[i] ~ /^ID=/) {
id_set = 1
if ($3 == "UTR") {
attributes[i] = "ID=utr-" sprintf("%05d", ++utr_count)
} else if ($3 == "CDS") {
attributes[i] = "ID=cds-" sprintf("%05d", ++cds_count)
}
}
if (attributes[i] ~ /^Parent=/) {
parent_set = 1
original_parent_id = substr(attributes[i], 8)
if (original_parent_id in mrna_map) {
attributes[i] = "Parent=" mrna_map[original_parent_id]
}
}
}
if (id_set == 0) {
if ($3 == "UTR") {
attributes[1] = "ID=utr-" sprintf("%05d", ++utr_count) ";" attributes[1]
} else if ($3 == "CDS") {
attributes[1] = "ID=cds-" sprintf("%05d", ++cds_count) ";" attributes[1]
}
}
if (parent_set == 0) {
attributes[length(attributes) + 1] = "Parent=" mrna_map[original_id]
}
$9 = attributes[1]
for (i = 2; i <= length(attributes); i++) {
$9 = $9 ";" attributes[i]
}
print $1, $2, $3, $4, $5, $6, $7, $8, $9 # Print the original UTR or CDS feature
# Add exon feature
exon_count++
new_exon_id = "ID=exon-" sprintf("%06d", exon_count)
exon_attributes = new_exon_id
for (i = 1; i <= length(attributes); i++) {
if (attributes[i] !~ /^ID=/) {
exon_attributes = exon_attributes ";" attributes[i]
}
}
# Ensure Parent attribute is included in exon attributes
if (parent_set == 1) {
exon_attributes = exon_attributes ";Parent=" mrna_map[original_parent_id]
}
print $1, $2, "exon", $4, $5, $6, $7, $8, exon_attributes
}
}
' "${data_dir}/${original_gff}" > "${data_dir}/${intermediate_gff}"
source .bashvars
head "${data_dir}"/"${intermediate_gff}"
Porites_evermani_scaffold_1 Gmove gene 3107 4488 . - . ID=gene-Peve_00000001
Porites_evermani_scaffold_1 Gmove mRNA 3107 4488 543 - . ID=mrna-00001;Name=Peve_00000001;start=0;stop=1;cds_size=543;Parent=gene-Peve_00000001
Porites_evermani_scaffold_1 Gmove CDS 3107 3444 . - . ID=cds-00001;Parent=mrna-00001
Porites_evermani_scaffold_1 Gmove exon 3107 3444 . - . ID=exon-000001;Parent=mrna-00001
Porites_evermani_scaffold_1 Gmove CDS 4284 4488 . - . ID=cds-00002;Parent=mrna-00001
Porites_evermani_scaffold_1 Gmove exon 4284 4488 . - . ID=exon-000002;Parent=mrna-00001
Porites_evermani_scaffold_1 Gmove gene 424479 429034 . - . ID=gene-Peve_00000002
Porites_evermani_scaffold_1 Gmove mRNA 424479 429034 2439.63 - . ID=mrna-00002;Name=Peve_00000002;start=1;stop=1;cds_size=2019;Parent=gene-Peve_00000002
Porites_evermani_scaffold_1 Gmove CDS 424479 425361 . - . ID=cds-00003;Parent=mrna-00002
Porites_evermani_scaffold_1 Gmove exon 424479 425361 . - . ID=exon-000003;Parent=mrna-00002
Validate GFF using genometools gff3
.
-tidy
: Attempts to clean/fix any potential issues.
-checkids
: Checks IDs.
-retainids
: Retains IDs from input GFF instead of assigning new ones.
source .bashvars
gt gff3 \
-tidy \
-checkids \
-retainids \
-sort \
"${data_dir}"/"${intermediate_gff}" \
> "${data_dir}"/"${validated_gff}" \
2> "${data_dir}"/gt_gff3_validation_errors.log
Process would stop if error occurred, so only need to check end of file.
Warnings are expected, as they generally indicated modifications to the GFF to bring it into compliance.
source .bashvars
tail "${data_dir}"/gt_gff3_validation_errors.log
warning: CDS feature on line 81189 in file "/home/sam/gitrepos/urol-e5/timeseries_molecular/E-Peve/data/intermediate.gff" has the wrong phase . -> correcting it to 0
warning: CDS feature on line 81191 in file "/home/sam/gitrepos/urol-e5/timeseries_molecular/E-Peve/data/intermediate.gff" has the wrong phase . -> correcting it to 2
warning: CDS feature on line 81193 in file "/home/sam/gitrepos/urol-e5/timeseries_molecular/E-Peve/data/intermediate.gff" has the wrong phase . -> correcting it to 2
warning: CDS feature on line 81197 in file "/home/sam/gitrepos/urol-e5/timeseries_molecular/E-Peve/data/intermediate.gff" has the wrong phase . -> correcting it to 0
warning: CDS feature on line 81199 in file "/home/sam/gitrepos/urol-e5/timeseries_molecular/E-Peve/data/intermediate.gff" has the wrong phase . -> correcting it to 0
warning: CDS feature on line 81203 in file "/home/sam/gitrepos/urol-e5/timeseries_molecular/E-Peve/data/intermediate.gff" has the wrong phase . -> correcting it to 0
warning: CDS feature on line 81205 in file "/home/sam/gitrepos/urol-e5/timeseries_molecular/E-Peve/data/intermediate.gff" has the wrong phase . -> correcting it to 0
warning: CDS feature on line 81207 in file "/home/sam/gitrepos/urol-e5/timeseries_molecular/E-Peve/data/intermediate.gff" has the wrong phase . -> correcting it to 2
warning: CDS feature on line 81209 in file "/home/sam/gitrepos/urol-e5/timeseries_molecular/E-Peve/data/intermediate.gff" has the wrong phase . -> correcting it to 0
warning: CDS feature on line 81211 in file "/home/sam/gitrepos/urol-e5/timeseries_molecular/E-Peve/data/intermediate.gff" has the wrong phase . -> correcting it to 0
source .bashvars
head "${data_dir}"/"${validated_gff}"
##gff-version 3
##sequence-region Porites_evermani_scaffold_1 3107 1802489
##sequence-region Porites_evermani_scaffold_10 16486 1155879
##sequence-region Porites_evermani_scaffold_100 2448 527357
##sequence-region Porites_evermani_scaffold_1000 5675 163688
##sequence-region Porites_evermani_scaffold_1001 5840 151448
##sequence-region Porites_evermani_scaffold_1002 3372 125467
##sequence-region Porites_evermani_scaffold_1003 139 162450
##sequence-region Porites_evermani_scaffold_1004 10061 159112
##sequence-region Porites_evermani_scaffold_1005 28090 152503
source .bashvars
rm "${data_dir}"/"${intermediate_gff}"
source .bashvars
# Sends progress to /dev/null to avoid crashing Rmd notebook
"${conda_path}" run -n ${conda_env_name} \
agat_convert_sp_gff2gtf.pl \
--gff "${data_dir}"/"${validated_gff}" \
--output "${data_dir}"/${gtf} \
1> "${data_dir}"/agat.log \
2> /dev/null
source .bashvars
cat "${data_dir}"/agat.log
converting to GTF3
********************************************************************************
* - Start parsing - *
********************************************************************************
-------------------------- parse options and metadata --------------------------
=> Accessing the feature level json files
Using standard /home/sam/programs/miniforge3-24.7.1-0/envs/agat_env/lib/site_perl/5.26.2/auto/share/dist/AGAT/features_level1.json file
Using standard /home/sam/programs/miniforge3-24.7.1-0/envs/agat_env/lib/site_perl/5.26.2/auto/share/dist/AGAT/features_level2.json file
Using standard /home/sam/programs/miniforge3-24.7.1-0/envs/agat_env/lib/site_perl/5.26.2/auto/share/dist/AGAT/features_level3.json file
Using standard /home/sam/programs/miniforge3-24.7.1-0/envs/agat_env/lib/site_perl/5.26.2/auto/share/dist/AGAT/features_spread.json file
=> Attribute used to group features when no Parent/ID relationship exists:
* locus_tag
* gene_id
=> merge_loci option deactivated
=> Accessing Ontology
No ontology accessible from the gff file header!
We use the SOFA ontology distributed with AGAT:
/home/sam/programs/miniforge3-24.7.1-0/envs/agat_env/lib/site_perl/5.26.2/auto/share/dist/AGAT/so.obo
Read ontology /home/sam/programs/miniforge3-24.7.1-0/envs/agat_env/lib/site_perl/5.26.2/auto/share/dist/AGAT/so.obo:
4 root terms, and 2472 total terms, and 1436 leaf terms
Filtering ontology:
We found 1757 terms that are sequence_feature or is_a child of it.
-------------------------------- parse features --------------------------------
=> GFF parser version used: 3
********************************************************************************
* - End parsing - *
* done in 109 seconds *
********************************************************************************
********************************************************************************
* - Start checks - *
********************************************************************************
---------------------------- Check1: feature types -----------------------------
----------------------------------- ontology -----------------------------------
All feature types in agreement with the Ontology.
------------------------------------- agat -------------------------------------
AGAT can deal with all the encountered feature types (3rd column)
------------------------------ done in 0 seconds -------------------------------
------------------------------ Check2: duplicates ------------------------------
None found
------------------------------ done in 0 seconds -------------------------------
-------------------------- Check3: sequential bucket ---------------------------
Nothing to check as sequential bucket!
------------------------------ done in 0 seconds -------------------------------
--------------------------- Check4: l2 linked to l3 ----------------------------
No problem found
------------------------------ done in 0 seconds -------------------------------
--------------------------- Check5: l1 linked to l2 ----------------------------
No problem found
------------------------------ done in 0 seconds -------------------------------
--------------------------- Check6: remove orphan l1 ---------------------------
We remove only those not supposed to be orphan
None found
------------------------------ done in 1 seconds -------------------------------
------------------------------ Check7: check cds -------------------------------
No problem found
------------------------------ done in 0 seconds -------------------------------
----------------------------- Check8: check exons ------------------------------
No exons created
No exons locations modified
10475 exons removed that were supernumerary
No level2 locations modified
------------------------------ done in 11 seconds ------------------------------
------------------------------ Check9: check utrs ------------------------------
No UTRs created
No UTRs locations modified
No supernumerary UTRs removed
------------------------------ done in 6 seconds -------------------------------
------------------------ Check10: all level2 locations -------------------------
No problem found
------------------------------ done in 5 seconds -------------------------------
------------------------ Check11: all level1 locations -------------------------
No problem found
------------------------------ done in 1 seconds -------------------------------
---------------------- Check12: remove identical isoforms ----------------------
None found
------------------------------ done in 0 seconds -------------------------------
********************************************************************************
* - End checks - *
* done in 24 seconds *
********************************************************************************
=> OmniscientI total time: 133 seconds
Bye Bye
source .bashvars
head "${data_dir}"/"${gtf}"
##gtf-version 3
##sequence-region Porites_evermani_scaffold_1 3107 1802489
##sequence-region Porites_evermani_scaffold_10 16486 1155879
##sequence-region Porites_evermani_scaffold_100 2448 527357
##sequence-region Porites_evermani_scaffold_1000 5675 163688
##sequence-region Porites_evermani_scaffold_1001 5840 151448
##sequence-region Porites_evermani_scaffold_1002 3372 125467
##sequence-region Porites_evermani_scaffold_1003 139 162450
##sequence-region Porites_evermani_scaffold_1004 10061 159112
##sequence-region Porites_evermani_scaffold_1005 28090 152503