This notebook reformats the original P.tuahiniensis GFF (Pocillopora_meandrina_HIv1.genes.gff3), to be compliant with the GFF3 standard. The genome was originally downloaded from here:
Despite the naming convention of the file, there are no designated gene
regions. The reformatting will add a gene
feature, based solely on the transcript
coordinates. There aren’t any other features beyoned CDS
and exon
, so adding gene
features which match the transcript
regions is primarily just to meet the GFF3 standard.
Additionally, this notebook will generate a GTF for downstream use with HiSat2 for creating a genome index which incorporates exon/intron junctions and splice sites.
Unlike other scripts, this will output to F-Ptuh/data
, instead of an output directory in ../output
.
Requires genometools (GitHub repo) to be installed and in the system $PATH
and is used for GFF3 validation/formatting.
Requires gffread (Pertea and Pertea 2020) which is used for GFF3-to-GTF conversion.
library(tidyverse)
This allows usage of Bash variables across R Markdown chunks.
{
echo "#### Assign Variables ####"
echo ""
echo "# Programs"
echo 'export gffread=~/programs/gffread/gffread'
echo "# Data directories"
echo 'export repo_dir=~/gitrepos/urol-e5/deep-dive-expression'
echo 'export data_dir=${repo_dir}/F-Ptuh/data'
echo ""
echo "# Input files"
echo 'export original_gff="Pocillopora_meandrina_HIv1.genes.gff3"'
echo 'export intermediate_gff="intermediate.gff3"'
echo 'export validated_gff="Pocillopora_meandrina_HIv1.genes-validated.gff3"'
echo 'export gtf="Pocillopora_meandrina_HIv1.genes-validated.gtf"'
echo ""
echo "# Print formatting"
echo 'export line="--------------------------------------------------------"'
echo ""
} > .bashvars
cat .bashvars
#### Assign Variables ####
# Programs
export gffread=~/programs/gffread/gffread
# Data directories
export repo_dir=~/gitrepos/urol-e5/deep-dive-expression
export data_dir=${repo_dir}/F-Ptuh/data
# Input files
export original_gff="Pocillopora_meandrina_HIv1.genes.gff3"
export intermediate_gff="intermediate.gff3"
export validated_gff="Pocillopora_meandrina_HIv1.genes-validated.gff3"
export gtf="Pocillopora_meandrina_HIv1.genes-validated.gtf"
# Print formatting
export line="--------------------------------------------------------"
gff <- read_delim("../data/Pocillopora_meandrina_HIv1.genes.gff3",
delim="\t",
col_names=c("seqid", "source", "type", "start", "end",
"score", "strand", "phase", "attributes"),
show_col_types = FALSE)
str(gff)
spc_tbl_ [448,910 × 9] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
$ seqid : chr [1:448910] "Pocillopora_meandrina_HIv1___Sc0000007" "Pocillopora_meandrina_HIv1___Sc0000007" "Pocillopora_meandrina_HIv1___Sc0000007" "Pocillopora_meandrina_HIv1___Sc0000007" ...
$ source : chr [1:448910] "AUGUSTUS" "AUGUSTUS" "AUGUSTUS" "AUGUSTUS" ...
$ type : chr [1:448910] "transcript" "CDS" "exon" "CDS" ...
$ start : num [1:448910] 9649 9649 9649 10225 10225 ...
$ end : num [1:448910] 18299 9706 9706 10352 10352 ...
$ score : chr [1:448910] "." "." "." "." ...
$ strand : chr [1:448910] "-" "-" "-" "-" ...
$ phase : chr [1:448910] "." "1" "1" "0" ...
$ attributes: chr [1:448910] "ID=Pocillopora_meandrina_HIv1___TS.g23774.t1" "Parent=Pocillopora_meandrina_HIv1___TS.g23774.t1" "Parent=Pocillopora_meandrina_HIv1___TS.g23774.t1" "Parent=Pocillopora_meandrina_HIv1___TS.g23774.t1" ...
- attr(*, "spec")=
.. cols(
.. seqid = col_character(),
.. source = col_character(),
.. type = col_character(),
.. start = col_double(),
.. end = col_double(),
.. score = col_character(),
.. strand = col_character(),
.. phase = col_character(),
.. attributes = col_character()
.. )
- attr(*, "problems")=<externalptr>
# Get and print unique types, one per line
cat("Unique feature types in original GFF3:\n")
Unique feature types in original GFF3:
gff %>%
distinct(type) %>%
pull(type) %>%
walk(~cat("-", .x, "\n"))
- transcript
- CDS
- exon
# Create gene entries
gene_entries <- gff %>%
filter(type == "transcript") %>%
mutate(
type = "gene",
attributes = paste0("ID=gene-", str_remove(attributes, "^ID="))
)
str(gene_entries)
tibble [31,840 × 9] (S3: tbl_df/tbl/data.frame)
$ seqid : chr [1:31840] "Pocillopora_meandrina_HIv1___Sc0000007" "Pocillopora_meandrina_HIv1___Sc0000007" "Pocillopora_meandrina_HIv1___Sc0000007" "Pocillopora_meandrina_HIv1___Sc0000007" ...
$ source : chr [1:31840] "AUGUSTUS" "AUGUSTUS" "AUGUSTUS" "AUGUSTUS" ...
$ type : chr [1:31840] "gene" "gene" "gene" "gene" ...
$ start : num [1:31840] 9649 36139 58831 73152 93651 ...
$ end : num [1:31840] 18299 36387 71330 87762 96614 ...
$ score : chr [1:31840] "." "." "." "." ...
$ strand : chr [1:31840] "-" "+" "+" "+" ...
$ phase : chr [1:31840] "." "." "." "." ...
$ attributes: chr [1:31840] "ID=gene-Pocillopora_meandrina_HIv1___TS.g23774.t1" "ID=gene-Pocillopora_meandrina_HIv1___RNAseq.g5056.t1" "ID=gene-Pocillopora_meandrina_HIv1___RNAseq.g5057.t1" "ID=gene-Pocillopora_meandrina_HIv1___RNAseq.g5058.t1" ...
# Create mRNA entries
mrna_entries <- gff %>%
filter(type == "transcript") %>%
mutate(
type = "mRNA",
gene_id = paste0("gene-", str_remove(attributes, "^ID=")),
attributes = paste0("ID=mrna-", str_remove(attributes, "^ID="),
";Parent=", gene_id)
) %>%
select(-gene_id)
str(mrna_entries)
tibble [31,840 × 9] (S3: tbl_df/tbl/data.frame)
$ seqid : chr [1:31840] "Pocillopora_meandrina_HIv1___Sc0000007" "Pocillopora_meandrina_HIv1___Sc0000007" "Pocillopora_meandrina_HIv1___Sc0000007" "Pocillopora_meandrina_HIv1___Sc0000007" ...
$ source : chr [1:31840] "AUGUSTUS" "AUGUSTUS" "AUGUSTUS" "AUGUSTUS" ...
$ type : chr [1:31840] "mRNA" "mRNA" "mRNA" "mRNA" ...
$ start : num [1:31840] 9649 36139 58831 73152 93651 ...
$ end : num [1:31840] 18299 36387 71330 87762 96614 ...
$ score : chr [1:31840] "." "." "." "." ...
$ strand : chr [1:31840] "-" "+" "+" "+" ...
$ phase : chr [1:31840] "." "." "." "." ...
$ attributes: chr [1:31840] "ID=mrna-Pocillopora_meandrina_HIv1___TS.g23774.t1;Parent=gene-Pocillopora_meandrina_HIv1___TS.g23774.t1" "ID=mrna-Pocillopora_meandrina_HIv1___RNAseq.g5056.t1;Parent=gene-Pocillopora_meandrina_HIv1___RNAseq.g5056.t1" "ID=mrna-Pocillopora_meandrina_HIv1___RNAseq.g5057.t1;Parent=gene-Pocillopora_meandrina_HIv1___RNAseq.g5057.t1" "ID=mrna-Pocillopora_meandrina_HIv1___RNAseq.g5058.t1;Parent=gene-Pocillopora_meandrina_HIv1___RNAseq.g5058.t1" ...
Prepends a feature ID (e.g. cds-
) to the corresponding attribute ID.
Also increments each exon
feature type within each mRNA, starting at `.
# Process remaining features
other_features <- gff %>%
group_by(seqid, group_id = cumsum(type == "transcript")) %>%
mutate(
mrna_id = first(if_else(type == "transcript",
paste0("mrna-", str_remove(attributes, "^ID=")),
NA_character_)),
# Calculate exon number within each group
exon_num = if_else(type == "exon",
dense_rank(if_else(type == "exon", row_number(), NA_real_)),
NA_real_),
# Format attributes
attributes = if_else(
type != "transcript",
paste0(
"ID=", tolower(type), "-", str_remove(first(if_else(type == "transcript",
attributes,
NA_character_)), "^ID="),
if_else(!is.na(exon_num),
paste0("-", exon_num), ""),
";Parent=", mrna_id
),
attributes
)
) %>%
ungroup() %>%
filter(type != "transcript") %>%
select(-group_id, -mrna_id, -exon_num)
str(other_features)
tibble [417,070 × 9] (S3: tbl_df/tbl/data.frame)
$ seqid : chr [1:417070] "Pocillopora_meandrina_HIv1___Sc0000007" "Pocillopora_meandrina_HIv1___Sc0000007" "Pocillopora_meandrina_HIv1___Sc0000007" "Pocillopora_meandrina_HIv1___Sc0000007" ...
$ source : chr [1:417070] "AUGUSTUS" "AUGUSTUS" "AUGUSTUS" "AUGUSTUS" ...
$ type : chr [1:417070] "CDS" "exon" "CDS" "exon" ...
$ start : num [1:417070] 9649 9649 10225 10225 11330 ...
$ end : num [1:417070] 9706 9706 10352 10352 11655 ...
$ score : chr [1:417070] "." "." "." "." ...
$ strand : chr [1:417070] "-" "-" "-" "-" ...
$ phase : chr [1:417070] "1" "1" "0" "0" ...
$ attributes: chr [1:417070] "ID=cds-Pocillopora_meandrina_HIv1___TS.g23774.t1;Parent=mrna-Pocillopora_meandrina_HIv1___TS.g23774.t1" "ID=exon-Pocillopora_meandrina_HIv1___TS.g23774.t1-1;Parent=mrna-Pocillopora_meandrina_HIv1___TS.g23774.t1" "ID=cds-Pocillopora_meandrina_HIv1___TS.g23774.t1;Parent=mrna-Pocillopora_meandrina_HIv1___TS.g23774.t1" "ID=exon-Pocillopora_meandrina_HIv1___TS.g23774.t1-2;Parent=mrna-Pocillopora_meandrina_HIv1___TS.g23774.t1" ...
intermediate_gff <- bind_rows(
gene_entries,
mrna_entries,
other_features
) %>%
arrange(seqid, start)
str(intermediate_gff)
tibble [480,750 × 9] (S3: tbl_df/tbl/data.frame)
$ seqid : chr [1:480750] "Pocillopora_meandrina_HIv1___Sc0000000" "Pocillopora_meandrina_HIv1___Sc0000000" "Pocillopora_meandrina_HIv1___Sc0000000" "Pocillopora_meandrina_HIv1___Sc0000000" ...
$ source : chr [1:480750] "AUGUSTUS" "AUGUSTUS" "AUGUSTUS" "AUGUSTUS" ...
$ type : chr [1:480750] "gene" "mRNA" "CDS" "exon" ...
$ start : num [1:480750] 10771 10771 10771 10771 12784 ...
$ end : num [1:480750] 23652 23652 11117 11117 12875 ...
$ score : chr [1:480750] "." "." "." "." ...
$ strand : chr [1:480750] "+" "+" "+" "+" ...
$ phase : chr [1:480750] "." "." "0" "0" ...
$ attributes: chr [1:480750] "ID=gene-Pocillopora_meandrina_HIv1___RNAseq.g20902.t1" "ID=mrna-Pocillopora_meandrina_HIv1___RNAseq.g20902.t1;Parent=gene-Pocillopora_meandrina_HIv1___RNAseq.g20902.t1" "ID=cds-Pocillopora_meandrina_HIv1___RNAseq.g20902.t1;Parent=mrna-Pocillopora_meandrina_HIv1___RNAseq.g20902.t1" "ID=exon-Pocillopora_meandrina_HIv1___RNAseq.g20902.t1-1;Parent=mrna-Pocillopora_meandrina_HIv1___RNAseq.g20902.t1" ...
# Get and print unique types, one per line
cat("Unique feature types in final GFF3:\n")
Unique feature types in final GFF3:
intermediate_gff %>%
distinct(type) %>%
pull(type) %>%
walk(~cat("-", .x, "\n"))
- gene
- mRNA
- CDS
- exon
write_delim(intermediate_gff,
"../data/intermediate.gff3",
delim="\t",
col_names=FALSE)
Validate GFF using genometools gff3
.
-tidy
: Attempts to clean/fix any potential issues.
-checkids
: Checks IDs.
-retainids
: Retains IDs from input GFF instead of assigning new ones.
source .bashvars
gt gff3 \
-tidy \
-checkids \
-retainids \
"${data_dir}"/"${intermediate_gff}" \
> "${data_dir}"/"${validated_gff}" \
2> "${data_dir}"/gt_gff3_validation_errors.log
Process would stop if error occurred, so only need to check end of file.
Warnings are expected, as this program warns the user when it is inserting text.
In this instance, when it is inserting GFF3-compliant comment lines denoting regions.
source .bashvars
tail "${data_dir}"/gt_gff3_validation_errors.log
warning: seqid "Pocillopora_meandrina_HIv1___xfSc0001231" on line 480507 in file "/home/sam/gitrepos/urol-e5/deep-dive-expression/F-Ptuh/data/intermediate.gff3" has not been previously introduced with a "##sequence-region" line, create such a line automatically
warning: seqid "Pocillopora_meandrina_HIv1___xpSc0001273" on line 480513 in file "/home/sam/gitrepos/urol-e5/deep-dive-expression/F-Ptuh/data/intermediate.gff3" has not been previously introduced with a "##sequence-region" line, create such a line automatically
warning: seqid "Pocillopora_meandrina_HIv1___xpSc0001275" on line 480595 in file "/home/sam/gitrepos/urol-e5/deep-dive-expression/F-Ptuh/data/intermediate.gff3" has not been previously introduced with a "##sequence-region" line, create such a line automatically
warning: seqid "Pocillopora_meandrina_HIv1___xpSc0001276" on line 480599 in file "/home/sam/gitrepos/urol-e5/deep-dive-expression/F-Ptuh/data/intermediate.gff3" has not been previously introduced with a "##sequence-region" line, create such a line automatically
warning: seqid "Pocillopora_meandrina_HIv1___xpSc0001280" on line 480627 in file "/home/sam/gitrepos/urol-e5/deep-dive-expression/F-Ptuh/data/intermediate.gff3" has not been previously introduced with a "##sequence-region" line, create such a line automatically
warning: seqid "Pocillopora_meandrina_HIv1___xpSc0001281" on line 480643 in file "/home/sam/gitrepos/urol-e5/deep-dive-expression/F-Ptuh/data/intermediate.gff3" has not been previously introduced with a "##sequence-region" line, create such a line automatically
warning: seqid "Pocillopora_meandrina_HIv1___xpSc0001290" on line 480673 in file "/home/sam/gitrepos/urol-e5/deep-dive-expression/F-Ptuh/data/intermediate.gff3" has not been previously introduced with a "##sequence-region" line, create such a line automatically
warning: seqid "Pocillopora_meandrina_HIv1___xpSc0001321" on line 480725 in file "/home/sam/gitrepos/urol-e5/deep-dive-expression/F-Ptuh/data/intermediate.gff3" has not been previously introduced with a "##sequence-region" line, create such a line automatically
warning: seqid "Pocillopora_meandrina_HIv1___xpSc0001331" on line 480737 in file "/home/sam/gitrepos/urol-e5/deep-dive-expression/F-Ptuh/data/intermediate.gff3" has not been previously introduced with a "##sequence-region" line, create such a line automatically
warning: seqid "Pocillopora_meandrina_HIv1___xpSc0001355" on line 480747 in file "/home/sam/gitrepos/urol-e5/deep-dive-expression/F-Ptuh/data/intermediate.gff3" has not been previously introduced with a "##sequence-region" line, create such a line automatically
source .bashvars
head "${data_dir}"/"${validated_gff}"
##gff-version 3
##sequence-region Pocillopora_meandrina_HIv1___xfSc0000716 887 39392
Pocillopora_meandrina_HIv1___xfSc0000716 AUGUSTUS gene 887 6811 . - . ID=gene-Pocillopora_meandrina_HIv1___RNAseq.g29951.t1
Pocillopora_meandrina_HIv1___xfSc0000716 AUGUSTUS mRNA 887 6811 . - . ID=mrna-Pocillopora_meandrina_HIv1___RNAseq.g29951.t1;Parent=gene-Pocillopora_meandrina_HIv1___RNAseq.g29951.t1
Pocillopora_meandrina_HIv1___xfSc0000716 AUGUSTUS CDS 887 973 . - 0 ID=cds-Pocillopora_meandrina_HIv1___RNAseq.g29951.t1;Parent=mrna-Pocillopora_meandrina_HIv1___RNAseq.g29951.t1
Pocillopora_meandrina_HIv1___xfSc0000716 AUGUSTUS exon 887 973 . - 0 ID=exon-Pocillopora_meandrina_HIv1___RNAseq.g29951.t1-1;Parent=mrna-Pocillopora_meandrina_HIv1___RNAseq.g29951.t1
Pocillopora_meandrina_HIv1___xfSc0000716 AUGUSTUS CDS 1828 1882 . - 1 ID=cds-Pocillopora_meandrina_HIv1___RNAseq.g29951.t1;Parent=mrna-Pocillopora_meandrina_HIv1___RNAseq.g29951.t1
Pocillopora_meandrina_HIv1___xfSc0000716 AUGUSTUS exon 1828 1882 . - 1 ID=exon-Pocillopora_meandrina_HIv1___RNAseq.g29951.t1-2;Parent=mrna-Pocillopora_meandrina_HIv1___RNAseq.g29951.t1
Pocillopora_meandrina_HIv1___xfSc0000716 AUGUSTUS CDS 2308 2371 . - 2 ID=cds-Pocillopora_meandrina_HIv1___RNAseq.g29951.t1;Parent=mrna-Pocillopora_meandrina_HIv1___RNAseq.g29951.t1
Pocillopora_meandrina_HIv1___xfSc0000716 AUGUSTUS exon 2308 2371 . - 2 ID=exon-Pocillopora_meandrina_HIv1___RNAseq.g29951.t1-3;Parent=mrna-Pocillopora_meandrina_HIv1___RNAseq.g29951.t1
source .bashvars
rm "${data_dir}"/"${intermediate_gff}"
source .bashvars
${gffread} \
-E \
-T \
"${data_dir}"/"${validated_gff}" \
> "${data_dir}"/"${gtf}" \
2> "${data_dir}"/gffread-gtf.log
source .bashvars
head "${data_dir}"/"${gtf}"
Pocillopora_meandrina_HIv1___xfSc0000716 AUGUSTUS transcript 887 6811 . - . transcript_id "mrna-Pocillopora_meandrina_HIv1___RNAseq.g29951.t1"; gene_id "gene-Pocillopora_meandrina_HIv1___RNAseq.g29951.t1"
Pocillopora_meandrina_HIv1___xfSc0000716 AUGUSTUS exon 887 973 . - . transcript_id "mrna-Pocillopora_meandrina_HIv1___RNAseq.g29951.t1"; gene_id "gene-Pocillopora_meandrina_HIv1___RNAseq.g29951.t1";
Pocillopora_meandrina_HIv1___xfSc0000716 AUGUSTUS exon 1828 1882 . - . transcript_id "mrna-Pocillopora_meandrina_HIv1___RNAseq.g29951.t1"; gene_id "gene-Pocillopora_meandrina_HIv1___RNAseq.g29951.t1";
Pocillopora_meandrina_HIv1___xfSc0000716 AUGUSTUS exon 2308 2371 . - . transcript_id "mrna-Pocillopora_meandrina_HIv1___RNAseq.g29951.t1"; gene_id "gene-Pocillopora_meandrina_HIv1___RNAseq.g29951.t1";
Pocillopora_meandrina_HIv1___xfSc0000716 AUGUSTUS exon 2891 2920 . - . transcript_id "mrna-Pocillopora_meandrina_HIv1___RNAseq.g29951.t1"; gene_id "gene-Pocillopora_meandrina_HIv1___RNAseq.g29951.t1";
Pocillopora_meandrina_HIv1___xfSc0000716 AUGUSTUS exon 3013 3067 . - . transcript_id "mrna-Pocillopora_meandrina_HIv1___RNAseq.g29951.t1"; gene_id "gene-Pocillopora_meandrina_HIv1___RNAseq.g29951.t1";
Pocillopora_meandrina_HIv1___xfSc0000716 AUGUSTUS exon 3369 3425 . - . transcript_id "mrna-Pocillopora_meandrina_HIv1___RNAseq.g29951.t1"; gene_id "gene-Pocillopora_meandrina_HIv1___RNAseq.g29951.t1";
Pocillopora_meandrina_HIv1___xfSc0000716 AUGUSTUS exon 3790 3855 . - . transcript_id "mrna-Pocillopora_meandrina_HIv1___RNAseq.g29951.t1"; gene_id "gene-Pocillopora_meandrina_HIv1___RNAseq.g29951.t1";
Pocillopora_meandrina_HIv1___xfSc0000716 AUGUSTUS exon 4639 4737 . - . transcript_id "mrna-Pocillopora_meandrina_HIv1___RNAseq.g29951.t1"; gene_id "gene-Pocillopora_meandrina_HIv1___RNAseq.g29951.t1";
Pocillopora_meandrina_HIv1___xfSc0000716 AUGUSTUS exon 4985 5020 . - . transcript_id "mrna-Pocillopora_meandrina_HIv1___RNAseq.g29951.t1"; gene_id "gene-Pocillopora_meandrina_HIv1___RNAseq.g29951.t1";
source .bashvars
head "${data_dir}"/gffread-gtf.log
Command line was:
/home/sam/programs/gffread/gffread -E -T /home/sam/gitrepos/urol-e5/deep-dive-expression/F-Ptuh/data/Pocillopora_meandrina_HIv1.genes-validated.gff3
.. loaded 31840 genomic features from /home/sam/gitrepos/urol-e5/deep-dive-expression/F-Ptuh/data/Pocillopora_meandrina_HIv1.genes-validated.gff3