#!/usr/bin/env python3

"""
> parse_gff3.py <

Python script intends to be a helper function that can be called by other
scripts to handle gff annotations.

Script requires a _file object_ (not filename).

The output is in the form of:
    dict[seqid] = {five_prime_UTR: {ID001: (start_coord1, end_coord1), ...}
                     three_prime_UTR: {ID001: (start_coord2, end_coord2), ...}
                     gene: ...
                     exon: ...
                     CDS: ...
                     ...: ...}
"""

# from http://genome.ucsc.edu/FAQ/FAQformat.html#format3:
# Here is a brief description of the GFF fields:
#   seqid - The name of the sequence. Must be a chromosome or scaffold. 
#   source - The program that generated this feature. 
#   feature - The name of this type of feature. Some examples of standard
#             feature types are "CDS", "start_codon", "stop_codon", and "exon". 
#   start - The starting position of the feature in the sequence. The first base
#           is numbered 1. 
#   end - The ending position of the feature (inclusive). 
#   score - A score between 0 and 1000. If the track line useScore attribute is
#           set to 1 for this annotation data set, the score value will
#           determine the level of gray in which this feature is displayed
#           (higher numbers = darker gray). If there is no score value, enter ".". 
#   strand - Valid entries include '+', '-', or '.' (for don't know/don't care). 
#   frame - If the feature is a coding exon, frame should be a number between
#           0-2 that represents the reading frame of the first base. If the
#           feature is not a coding exon, the value should be '.'. 
#   group - All lines with the same group are linked together into a single item.

# sample line:
#   RNA-1;1.gff:1000131     maker   five_prime_UTR  1968    1999    .   
#   +       .       ID=maker-1000131-exonerate...

# IMPORTANT NOTE ABOUT startpos AND endpos: the biological definition of
# position (1-based) is BAD at dealing with the edge case of startpos == endpos.
# If startpos == endpos, then there's no way to tell whether it's on the 
# '+' strand or the '-' strand.
# Thus, startpos and endpos produced by the parse_gff3 def follows the Python
# convention (starting base is sequence[0:1]), which allows for the
# discrimination of '+' and '-' strands.
#   e.g. if startpos = 2 and endpos = 3: '+' strand    }  both refer to
#        if endpos = 2 and startpos = 3: '-' strand    }  the same base

import csv
import re

class Gene:
    def __init__(self, coords):
        self.coords = coords
        self.mRNAs = {}
    
    def __len__(self):
        return abs(self.coords[1] - self.coords[0])
    
    def add_mRNA(self, mRNA_id, coords):
        """'mRNA' is a class!"""
        self.mRNAs[mRNA_id] = mRNA(coords)

class mRNA:
    def __init__(self, coords):
        self.coords = coords
        self.details = {}
    
    def __len__(self):
        return abs(self.coords[1] - self.coords[0])
    
    def add_feature(self, feature_id, coords):
        """coords is a tuple!"""
        if feature_id not in self.details:
            self.details[feature_id] = []
        
        self.details[feature_id].append(coords)

def natural_sort(input_list):
    tryint = lambda x: int(x) if x.isdigit() else x
    chunked_text = lambda x: [tryint(y) for y in re.split('([0-9]+)', x)]
    sorted_list = sorted(input_list, key=chunked_text)

    return sorted_list

def calc_total_exonic_length(list_of_exon_coords):
    """
    Given a bunch of exonic coordinates, calculates the total length.
    """
    return sum([max(x) - min(x) for x in list_of_exon_coords])
    
    
def get_attribute(gff3_row, attribute):
    attr_search = re.search('{}=(.*?)(;|$)'.format(attribute), gff3_row[8])
    if attr_search:
        return attr_search.group(1)
    else:
        raise AttributeError("'{}' does not contain '{}='.".format(
            '\t'.join(gff3_row), attribute))

def get_coords(gff3_row):
    """
    Returned coordinates are in the tuple (startpos, endpos).
    startpos/endpos are 0-based, not 1-based.
    """
    
    strand = gff3_row[6]
    if strand == '+':
        startpos = int(gff3_row[3]) - 1
        endpos = int(gff3_row[4])
    elif strand == '-':
        startpos = int(gff3_row[4])
        endpos = int(gff3_row[3]) - 1
    else:
        print ('Error: "{}" has no valid strand information.'\
            .format('\t'.join(gff3_row)))
        raise SystemExit()
        
    return (startpos, endpos)

def pick_longest_mRNA(gff_details):
    """
    Given mRNAs contained in Genes, retain only the longest mRNA per gene.
    
    The longest mRNA is the mRNA containing the longest total exonic length,
    not raw mRNA length.
    """
    for s in gff_details:
        for g in gff_details[s]:
            # for the sort, keys are sorted by longest exonic length in reverse.
            # in case of ties, the longest mRNA (by length) is picked.
            sorted_mRNA = sorted(gff_details[s][g].mRNAs, 
                    key=lambda x: (-calc_total_exonic_length(
                            gff_details[s][g].mRNAs[x].details['exon']), 
                    -len(gff_details[s][g].mRNAs[x])))
            
            # eliminate all other keys except for the longest mRNA.
            gff_details[s][g].mRNAs = {sorted_mRNA[0]: 
                                       gff_details[s][g].mRNAs[sorted_mRNA[0]]}
            
    return gff_details
    
def sort_features(gff_details):
    """
    Makes sure that feature coordinates are sorted in the right order
    (i.e. ascending order for genes in the 5'-3' order; descending for
    genes that are in the 3'-5' order).
    """
    for s in gff_details:
        for g in gff_details[s]:
            for m in gff_details[s][g].mRNAs:
                for f in gff_details[s][g].mRNAs[m].details:
                    # gff_details[s][g].mRNAs[m].details[f] has data in the
                    # form of coordinates, e.g. [(23559, 22882), (22781, 22387)]
                    coords = gff_details[s][g].mRNAs[m].details[f]
                    
                    if len(coords) == 1: continue
                    
                    desc_order = coords[0][0] > coords[0][1]
                    gff_details[s][g].mRNAs[m].details[f] = \
                        sorted(coords, reverse=desc_order)
    
    return gff_details
    
def parse_gff3(gff_file, select_feature='all'):
    """
    'gff_file' refers to file object containing gff file.
    'select_feature' can be used to select for one or more features of interest
    in the gff file (e.g. "three_prime_UTR", ['three_prime_UTR', 'five_prime_UTR'])
    
    NOTE: all variables are named according to convention!
    """
    gff_details = {}
    
    tsv_reader = csv.reader(gff_file, delimiter='\t')
    for row in tsv_reader:
        # ignore blank lines and comments (lines starting with '#')
        if not row: continue
        if row[0].startswith('#'): continue
        
        feature = row[2]
        # ignore lines that do not correspond to the feature wanted (with the
        # exception of gene and mRNA, we always want that information).
        if select_feature == 'all' or feature in ['gene', 'mRNA'] \
                                   or feature in select_feature:
            seqid = row[0]
            
            if seqid not in gff_details:
                gff_details[seqid] = {}
            
            coords = get_coords(row)
            if feature == 'gene':
                gene_id = get_attribute(row, 'ID')
                gff_details[seqid][gene_id] = Gene(coords)
            elif feature == 'mRNA':
                mRNA_id = get_attribute(row, 'ID')
                gff_details[seqid][gene_id].add_mRNA(mRNA_id, coords)
            else:
                # mRNA_ids might be multiple IDs, separated with commas.
                mRNA_ids = get_attribute(row, 'Parent')
                for m in mRNA_ids.split(','):
                    gff_details[seqid][gene_id].mRNAs[m].add_feature(feature, coords)
    
    return gff_details
    
# debug
# if __name__ == '__main__':
    # temp = parse_gff3(open('test.gff3'), select_feature='exon')
    # print (temp['ctg123']['gene00001'].mRNAs)
    # print (pick_longest_mRNA(temp)['ctg123']['gene00001'].mRNAs)