This notebook reformats the original P.tuahiniensis GFF (Pocillopora_meandrina_HIv1.genes.gff3), to be compliant with the GFF3 standard. The genome was originally downloaded from here:
Despite the naming convention of the file, there are no designated gene regions. The reformatting will add a gene feature, based solely on the transcript coordinates. There aren’t any other features beyoned CDS and exon, so adding gene features which match the transcript regions is primarily just to meet the GFF3 standard.
Additionally, this notebook will generate a GTF for downstream use with HiSat2 for creating a genome index which incorporates exon/intron junctions and splice sites.
Unlike other scripts, this will output to F-Ptuh/data, instead of an output directory in ../output.
Requires genometools (GitHub repo) to be installed and in the system $PATH and is used for GFF3 validation/formatting.
Requires gffread (Pertea and Pertea 2020) which is used for GFF3-to-GTF conversion.
library(tidyverse)
This allows usage of Bash variables across R Markdown chunks.
{
echo "#### Assign Variables ####"
echo ""
echo "# Programs"
echo 'export gffread=~/programs/gffread/gffread'
echo "# Data directories"
echo 'export repo_dir=~/gitrepos/urol-e5/deep-dive-expression'
echo 'export data_dir=${repo_dir}/F-Ptuh/data'
echo ""
echo "# Input files"
echo 'export original_gff="Pocillopora_meandrina_HIv1.genes.gff3"'
echo 'export intermediate_gff="intermediate.gff3"'
echo 'export validated_gff="Pocillopora_meandrina_HIv1.genes-validated.gff3"'
echo 'export gtf="Pocillopora_meandrina_HIv1.genes-validated.gtf"'
echo ""
echo "# Print formatting"
echo 'export line="--------------------------------------------------------"'
echo ""
} > .bashvars
cat .bashvars
#### Assign Variables ####
# Programs
export gffread=~/programs/gffread/gffread
# Data directories
export repo_dir=~/gitrepos/urol-e5/deep-dive-expression
export data_dir=${repo_dir}/F-Ptuh/data
# Input files
export original_gff="Pocillopora_meandrina_HIv1.genes.gff3"
export intermediate_gff="intermediate.gff3"
export validated_gff="Pocillopora_meandrina_HIv1.genes-validated.gff3"
export gtf="Pocillopora_meandrina_HIv1.genes-validated.gtf"
# Print formatting
export line="--------------------------------------------------------"
gff <- read_delim("../data/Pocillopora_meandrina_HIv1.genes.gff3", 
                  delim="\t", 
                  col_names=c("seqid", "source", "type", "start", "end", 
                            "score", "strand", "phase", "attributes"),
                  show_col_types = FALSE)
str(gff)
spc_tbl_ [448,910 × 9] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
 $ seqid     : chr [1:448910] "Pocillopora_meandrina_HIv1___Sc0000007" "Pocillopora_meandrina_HIv1___Sc0000007" "Pocillopora_meandrina_HIv1___Sc0000007" "Pocillopora_meandrina_HIv1___Sc0000007" ...
 $ source    : chr [1:448910] "AUGUSTUS" "AUGUSTUS" "AUGUSTUS" "AUGUSTUS" ...
 $ type      : chr [1:448910] "transcript" "CDS" "exon" "CDS" ...
 $ start     : num [1:448910] 9649 9649 9649 10225 10225 ...
 $ end       : num [1:448910] 18299 9706 9706 10352 10352 ...
 $ score     : chr [1:448910] "." "." "." "." ...
 $ strand    : chr [1:448910] "-" "-" "-" "-" ...
 $ phase     : chr [1:448910] "." "1" "1" "0" ...
 $ attributes: chr [1:448910] "ID=Pocillopora_meandrina_HIv1___TS.g23774.t1" "Parent=Pocillopora_meandrina_HIv1___TS.g23774.t1" "Parent=Pocillopora_meandrina_HIv1___TS.g23774.t1" "Parent=Pocillopora_meandrina_HIv1___TS.g23774.t1" ...
 - attr(*, "spec")=
  .. cols(
  ..   seqid = col_character(),
  ..   source = col_character(),
  ..   type = col_character(),
  ..   start = col_double(),
  ..   end = col_double(),
  ..   score = col_character(),
  ..   strand = col_character(),
  ..   phase = col_character(),
  ..   attributes = col_character()
  .. )
 - attr(*, "problems")=<externalptr> 
# Get and print unique types, one per line
cat("Unique feature types in original GFF3:\n")
Unique feature types in original GFF3:
gff %>%
  distinct(type) %>%
  pull(type) %>%
  walk(~cat("-", .x, "\n"))
- transcript 
- CDS 
- exon 
# Create gene entries
gene_entries <- gff %>%
  filter(type == "transcript") %>%
  mutate(
    type = "gene",
    attributes = paste0("ID=gene-", str_remove(attributes, "^ID="))
  )
str(gene_entries)
tibble [31,840 × 9] (S3: tbl_df/tbl/data.frame)
 $ seqid     : chr [1:31840] "Pocillopora_meandrina_HIv1___Sc0000007" "Pocillopora_meandrina_HIv1___Sc0000007" "Pocillopora_meandrina_HIv1___Sc0000007" "Pocillopora_meandrina_HIv1___Sc0000007" ...
 $ source    : chr [1:31840] "AUGUSTUS" "AUGUSTUS" "AUGUSTUS" "AUGUSTUS" ...
 $ type      : chr [1:31840] "gene" "gene" "gene" "gene" ...
 $ start     : num [1:31840] 9649 36139 58831 73152 93651 ...
 $ end       : num [1:31840] 18299 36387 71330 87762 96614 ...
 $ score     : chr [1:31840] "." "." "." "." ...
 $ strand    : chr [1:31840] "-" "+" "+" "+" ...
 $ phase     : chr [1:31840] "." "." "." "." ...
 $ attributes: chr [1:31840] "ID=gene-Pocillopora_meandrina_HIv1___TS.g23774.t1" "ID=gene-Pocillopora_meandrina_HIv1___RNAseq.g5056.t1" "ID=gene-Pocillopora_meandrina_HIv1___RNAseq.g5057.t1" "ID=gene-Pocillopora_meandrina_HIv1___RNAseq.g5058.t1" ...
# Create mRNA entries
mrna_entries <- gff %>%
  filter(type == "transcript") %>%
  mutate(
    type = "mRNA",
    gene_id = paste0("gene-", str_remove(attributes, "^ID=")),
    attributes = paste0("ID=mrna-", str_remove(attributes, "^ID="), 
                       ";Parent=", gene_id)
  ) %>%
  select(-gene_id)
str(mrna_entries)
tibble [31,840 × 9] (S3: tbl_df/tbl/data.frame)
 $ seqid     : chr [1:31840] "Pocillopora_meandrina_HIv1___Sc0000007" "Pocillopora_meandrina_HIv1___Sc0000007" "Pocillopora_meandrina_HIv1___Sc0000007" "Pocillopora_meandrina_HIv1___Sc0000007" ...
 $ source    : chr [1:31840] "AUGUSTUS" "AUGUSTUS" "AUGUSTUS" "AUGUSTUS" ...
 $ type      : chr [1:31840] "mRNA" "mRNA" "mRNA" "mRNA" ...
 $ start     : num [1:31840] 9649 36139 58831 73152 93651 ...
 $ end       : num [1:31840] 18299 36387 71330 87762 96614 ...
 $ score     : chr [1:31840] "." "." "." "." ...
 $ strand    : chr [1:31840] "-" "+" "+" "+" ...
 $ phase     : chr [1:31840] "." "." "." "." ...
 $ attributes: chr [1:31840] "ID=mrna-Pocillopora_meandrina_HIv1___TS.g23774.t1;Parent=gene-Pocillopora_meandrina_HIv1___TS.g23774.t1" "ID=mrna-Pocillopora_meandrina_HIv1___RNAseq.g5056.t1;Parent=gene-Pocillopora_meandrina_HIv1___RNAseq.g5056.t1" "ID=mrna-Pocillopora_meandrina_HIv1___RNAseq.g5057.t1;Parent=gene-Pocillopora_meandrina_HIv1___RNAseq.g5057.t1" "ID=mrna-Pocillopora_meandrina_HIv1___RNAseq.g5058.t1;Parent=gene-Pocillopora_meandrina_HIv1___RNAseq.g5058.t1" ...
Prepends a feature ID (e.g. cds-) to the corresponding attribute ID.
Also increments each exon feature type within each mRNA, starting at `.
# Process remaining features
other_features <- gff %>%
  group_by(seqid, group_id = cumsum(type == "transcript")) %>%
  mutate(
    mrna_id = first(if_else(type == "transcript", 
                           paste0("mrna-", str_remove(attributes, "^ID=")), 
                           NA_character_)),
    # Calculate exon number within each group
    exon_num = if_else(type == "exon",
                      dense_rank(if_else(type == "exon", row_number(), NA_real_)),
                      NA_real_),
    # Format attributes
    attributes = if_else(
      type != "transcript",
      paste0(
        "ID=", tolower(type), "-", str_remove(first(if_else(type == "transcript", 
                                                           attributes, 
                                                           NA_character_)), "^ID="),
        if_else(!is.na(exon_num),
                paste0("-", exon_num), ""),
        ";Parent=", mrna_id
      ),
      attributes
    )
  ) %>%
  ungroup() %>%
  filter(type != "transcript") %>%
  select(-group_id, -mrna_id, -exon_num)
str(other_features)
tibble [417,070 × 9] (S3: tbl_df/tbl/data.frame)
 $ seqid     : chr [1:417070] "Pocillopora_meandrina_HIv1___Sc0000007" "Pocillopora_meandrina_HIv1___Sc0000007" "Pocillopora_meandrina_HIv1___Sc0000007" "Pocillopora_meandrina_HIv1___Sc0000007" ...
 $ source    : chr [1:417070] "AUGUSTUS" "AUGUSTUS" "AUGUSTUS" "AUGUSTUS" ...
 $ type      : chr [1:417070] "CDS" "exon" "CDS" "exon" ...
 $ start     : num [1:417070] 9649 9649 10225 10225 11330 ...
 $ end       : num [1:417070] 9706 9706 10352 10352 11655 ...
 $ score     : chr [1:417070] "." "." "." "." ...
 $ strand    : chr [1:417070] "-" "-" "-" "-" ...
 $ phase     : chr [1:417070] "1" "1" "0" "0" ...
 $ attributes: chr [1:417070] "ID=cds-Pocillopora_meandrina_HIv1___TS.g23774.t1;Parent=mrna-Pocillopora_meandrina_HIv1___TS.g23774.t1" "ID=exon-Pocillopora_meandrina_HIv1___TS.g23774.t1-1;Parent=mrna-Pocillopora_meandrina_HIv1___TS.g23774.t1" "ID=cds-Pocillopora_meandrina_HIv1___TS.g23774.t1;Parent=mrna-Pocillopora_meandrina_HIv1___TS.g23774.t1" "ID=exon-Pocillopora_meandrina_HIv1___TS.g23774.t1-2;Parent=mrna-Pocillopora_meandrina_HIv1___TS.g23774.t1" ...
intermediate_gff <- bind_rows(
  gene_entries,
  mrna_entries,
  other_features
) %>%
  arrange(seqid, start)
str(intermediate_gff)
tibble [480,750 × 9] (S3: tbl_df/tbl/data.frame)
 $ seqid     : chr [1:480750] "Pocillopora_meandrina_HIv1___Sc0000000" "Pocillopora_meandrina_HIv1___Sc0000000" "Pocillopora_meandrina_HIv1___Sc0000000" "Pocillopora_meandrina_HIv1___Sc0000000" ...
 $ source    : chr [1:480750] "AUGUSTUS" "AUGUSTUS" "AUGUSTUS" "AUGUSTUS" ...
 $ type      : chr [1:480750] "gene" "mRNA" "CDS" "exon" ...
 $ start     : num [1:480750] 10771 10771 10771 10771 12784 ...
 $ end       : num [1:480750] 23652 23652 11117 11117 12875 ...
 $ score     : chr [1:480750] "." "." "." "." ...
 $ strand    : chr [1:480750] "+" "+" "+" "+" ...
 $ phase     : chr [1:480750] "." "." "0" "0" ...
 $ attributes: chr [1:480750] "ID=gene-Pocillopora_meandrina_HIv1___RNAseq.g20902.t1" "ID=mrna-Pocillopora_meandrina_HIv1___RNAseq.g20902.t1;Parent=gene-Pocillopora_meandrina_HIv1___RNAseq.g20902.t1" "ID=cds-Pocillopora_meandrina_HIv1___RNAseq.g20902.t1;Parent=mrna-Pocillopora_meandrina_HIv1___RNAseq.g20902.t1" "ID=exon-Pocillopora_meandrina_HIv1___RNAseq.g20902.t1-1;Parent=mrna-Pocillopora_meandrina_HIv1___RNAseq.g20902.t1" ...
# Get and print unique types, one per line
cat("Unique feature types in final GFF3:\n")
Unique feature types in final GFF3:
intermediate_gff %>%
  distinct(type) %>%
  pull(type) %>%
  walk(~cat("-", .x, "\n"))
- gene 
- mRNA 
- CDS 
- exon 
write_delim(intermediate_gff, 
            "../data/intermediate.gff3", 
            delim="\t",
            col_names=FALSE)
Validate GFF using genometools gff3.
-tidy: Attempts to clean/fix any potential issues.
-checkids: Checks IDs.
-retainids: Retains IDs from input GFF instead of assigning new ones.
source .bashvars
gt gff3 \
-tidy \
-checkids \
-retainids \
"${data_dir}"/"${intermediate_gff}" \
> "${data_dir}"/"${validated_gff}" \
2> "${data_dir}"/gt_gff3_validation_errors.log
Process would stop if error occurred, so only need to check end of file.
Warnings are expected, as this program warns the user when it is inserting text.
In this instance, when it is inserting GFF3-compliant comment lines denoting regions.
source .bashvars
tail "${data_dir}"/gt_gff3_validation_errors.log
warning: seqid "Pocillopora_meandrina_HIv1___xfSc0001231" on line 480507 in file "/home/sam/gitrepos/urol-e5/deep-dive-expression/F-Ptuh/data/intermediate.gff3" has not been previously introduced with a "##sequence-region" line, create such a line automatically
warning: seqid "Pocillopora_meandrina_HIv1___xpSc0001273" on line 480513 in file "/home/sam/gitrepos/urol-e5/deep-dive-expression/F-Ptuh/data/intermediate.gff3" has not been previously introduced with a "##sequence-region" line, create such a line automatically
warning: seqid "Pocillopora_meandrina_HIv1___xpSc0001275" on line 480595 in file "/home/sam/gitrepos/urol-e5/deep-dive-expression/F-Ptuh/data/intermediate.gff3" has not been previously introduced with a "##sequence-region" line, create such a line automatically
warning: seqid "Pocillopora_meandrina_HIv1___xpSc0001276" on line 480599 in file "/home/sam/gitrepos/urol-e5/deep-dive-expression/F-Ptuh/data/intermediate.gff3" has not been previously introduced with a "##sequence-region" line, create such a line automatically
warning: seqid "Pocillopora_meandrina_HIv1___xpSc0001280" on line 480627 in file "/home/sam/gitrepos/urol-e5/deep-dive-expression/F-Ptuh/data/intermediate.gff3" has not been previously introduced with a "##sequence-region" line, create such a line automatically
warning: seqid "Pocillopora_meandrina_HIv1___xpSc0001281" on line 480643 in file "/home/sam/gitrepos/urol-e5/deep-dive-expression/F-Ptuh/data/intermediate.gff3" has not been previously introduced with a "##sequence-region" line, create such a line automatically
warning: seqid "Pocillopora_meandrina_HIv1___xpSc0001290" on line 480673 in file "/home/sam/gitrepos/urol-e5/deep-dive-expression/F-Ptuh/data/intermediate.gff3" has not been previously introduced with a "##sequence-region" line, create such a line automatically
warning: seqid "Pocillopora_meandrina_HIv1___xpSc0001321" on line 480725 in file "/home/sam/gitrepos/urol-e5/deep-dive-expression/F-Ptuh/data/intermediate.gff3" has not been previously introduced with a "##sequence-region" line, create such a line automatically
warning: seqid "Pocillopora_meandrina_HIv1___xpSc0001331" on line 480737 in file "/home/sam/gitrepos/urol-e5/deep-dive-expression/F-Ptuh/data/intermediate.gff3" has not been previously introduced with a "##sequence-region" line, create such a line automatically
warning: seqid "Pocillopora_meandrina_HIv1___xpSc0001355" on line 480747 in file "/home/sam/gitrepos/urol-e5/deep-dive-expression/F-Ptuh/data/intermediate.gff3" has not been previously introduced with a "##sequence-region" line, create such a line automatically
source .bashvars
head "${data_dir}"/"${validated_gff}"
##gff-version 3
##sequence-region   Pocillopora_meandrina_HIv1___xfSc0000716 887 39392
Pocillopora_meandrina_HIv1___xfSc0000716    AUGUSTUS    gene    887 6811    .   -   .   ID=gene-Pocillopora_meandrina_HIv1___RNAseq.g29951.t1
Pocillopora_meandrina_HIv1___xfSc0000716    AUGUSTUS    mRNA    887 6811    .   -   .   ID=mrna-Pocillopora_meandrina_HIv1___RNAseq.g29951.t1;Parent=gene-Pocillopora_meandrina_HIv1___RNAseq.g29951.t1
Pocillopora_meandrina_HIv1___xfSc0000716    AUGUSTUS    CDS 887 973 .   -   0   ID=cds-Pocillopora_meandrina_HIv1___RNAseq.g29951.t1;Parent=mrna-Pocillopora_meandrina_HIv1___RNAseq.g29951.t1
Pocillopora_meandrina_HIv1___xfSc0000716    AUGUSTUS    exon    887 973 .   -   0   ID=exon-Pocillopora_meandrina_HIv1___RNAseq.g29951.t1-1;Parent=mrna-Pocillopora_meandrina_HIv1___RNAseq.g29951.t1
Pocillopora_meandrina_HIv1___xfSc0000716    AUGUSTUS    CDS 1828    1882    .   -   1   ID=cds-Pocillopora_meandrina_HIv1___RNAseq.g29951.t1;Parent=mrna-Pocillopora_meandrina_HIv1___RNAseq.g29951.t1
Pocillopora_meandrina_HIv1___xfSc0000716    AUGUSTUS    exon    1828    1882    .   -   1   ID=exon-Pocillopora_meandrina_HIv1___RNAseq.g29951.t1-2;Parent=mrna-Pocillopora_meandrina_HIv1___RNAseq.g29951.t1
Pocillopora_meandrina_HIv1___xfSc0000716    AUGUSTUS    CDS 2308    2371    .   -   2   ID=cds-Pocillopora_meandrina_HIv1___RNAseq.g29951.t1;Parent=mrna-Pocillopora_meandrina_HIv1___RNAseq.g29951.t1
Pocillopora_meandrina_HIv1___xfSc0000716    AUGUSTUS    exon    2308    2371    .   -   2   ID=exon-Pocillopora_meandrina_HIv1___RNAseq.g29951.t1-3;Parent=mrna-Pocillopora_meandrina_HIv1___RNAseq.g29951.t1
source .bashvars
rm "${data_dir}"/"${intermediate_gff}"
source .bashvars
${gffread} \
-E \
-T \
"${data_dir}"/"${validated_gff}" \
> "${data_dir}"/"${gtf}" \
2> "${data_dir}"/gffread-gtf.log
source .bashvars
head "${data_dir}"/"${gtf}"
Pocillopora_meandrina_HIv1___xfSc0000716    AUGUSTUS    transcript  887 6811    .   -   .   transcript_id "mrna-Pocillopora_meandrina_HIv1___RNAseq.g29951.t1"; gene_id "gene-Pocillopora_meandrina_HIv1___RNAseq.g29951.t1"
Pocillopora_meandrina_HIv1___xfSc0000716    AUGUSTUS    exon    887 973 .   -   .   transcript_id "mrna-Pocillopora_meandrina_HIv1___RNAseq.g29951.t1"; gene_id "gene-Pocillopora_meandrina_HIv1___RNAseq.g29951.t1";
Pocillopora_meandrina_HIv1___xfSc0000716    AUGUSTUS    exon    1828    1882    .   -   .   transcript_id "mrna-Pocillopora_meandrina_HIv1___RNAseq.g29951.t1"; gene_id "gene-Pocillopora_meandrina_HIv1___RNAseq.g29951.t1";
Pocillopora_meandrina_HIv1___xfSc0000716    AUGUSTUS    exon    2308    2371    .   -   .   transcript_id "mrna-Pocillopora_meandrina_HIv1___RNAseq.g29951.t1"; gene_id "gene-Pocillopora_meandrina_HIv1___RNAseq.g29951.t1";
Pocillopora_meandrina_HIv1___xfSc0000716    AUGUSTUS    exon    2891    2920    .   -   .   transcript_id "mrna-Pocillopora_meandrina_HIv1___RNAseq.g29951.t1"; gene_id "gene-Pocillopora_meandrina_HIv1___RNAseq.g29951.t1";
Pocillopora_meandrina_HIv1___xfSc0000716    AUGUSTUS    exon    3013    3067    .   -   .   transcript_id "mrna-Pocillopora_meandrina_HIv1___RNAseq.g29951.t1"; gene_id "gene-Pocillopora_meandrina_HIv1___RNAseq.g29951.t1";
Pocillopora_meandrina_HIv1___xfSc0000716    AUGUSTUS    exon    3369    3425    .   -   .   transcript_id "mrna-Pocillopora_meandrina_HIv1___RNAseq.g29951.t1"; gene_id "gene-Pocillopora_meandrina_HIv1___RNAseq.g29951.t1";
Pocillopora_meandrina_HIv1___xfSc0000716    AUGUSTUS    exon    3790    3855    .   -   .   transcript_id "mrna-Pocillopora_meandrina_HIv1___RNAseq.g29951.t1"; gene_id "gene-Pocillopora_meandrina_HIv1___RNAseq.g29951.t1";
Pocillopora_meandrina_HIv1___xfSc0000716    AUGUSTUS    exon    4639    4737    .   -   .   transcript_id "mrna-Pocillopora_meandrina_HIv1___RNAseq.g29951.t1"; gene_id "gene-Pocillopora_meandrina_HIv1___RNAseq.g29951.t1";
Pocillopora_meandrina_HIv1___xfSc0000716    AUGUSTUS    exon    4985    5020    .   -   .   transcript_id "mrna-Pocillopora_meandrina_HIv1___RNAseq.g29951.t1"; gene_id "gene-Pocillopora_meandrina_HIv1___RNAseq.g29951.t1";
source .bashvars
head "${data_dir}"/gffread-gtf.log
Command line was:
/home/sam/programs/gffread/gffread -E -T /home/sam/gitrepos/urol-e5/deep-dive-expression/F-Ptuh/data/Pocillopora_meandrina_HIv1.genes-validated.gff3
   .. loaded 31840 genomic features from /home/sam/gitrepos/urol-e5/deep-dive-expression/F-Ptuh/data/Pocillopora_meandrina_HIv1.genes-validated.gff3