#!/usr/bin/env python3 """ > annotate_bismark_cov.py < Script calculates the distribution of methylation over entire genic/intergenic regions, and prints out: - Gene name, if in a genic region; "intergenic", if outside. - How far the methylated position is along the genic/intergenic region, in % terms. - How far the methylated position is from the 5' end of the genic/intergenic region. Values range from +1 (right splat on 5' edge) to len(region). - How far the methylated position is from the 3' end of the genic/intergenic region. Values range from -1 (right splat on 3' edge) to -len(region). - Same information for exons/introns as for genes, when present. """ import argparse import csv import re import sys import time import numpy as np import natural_sort import parse_fasta import parse_gff3 parser = argparse.ArgumentParser(description=""" Script calculates the distribution of methylation over entire genic/intergenic regions, and prints out lotsa details.""") parser.add_argument('genome_fasta', metavar='fasta_file', type=argparse.FileType('r'), help='genome FASTA file.') parser.add_argument('genome_gff3', metavar='gff3_file', type=argparse.FileType('r'), help='genome GFF3 annotation.') parser.add_argument('bismark_cov', metavar="cov_filename", type=argparse.FileType('r'), nargs='?', default=sys.stdin, help="Bismark .cov filename.") parser.add_argument('-v', '--verbose', action='store_true', help='prints diagnostic stuff to stderr.') args = parser.parse_args() def is_numpy_array_filled(np_array, coords): return np.any(np_array[min(coords):max(coords)]) def assign_value_to_numpy_array(np_array, coords, assigned_value): """ Coords are in a tuple (coords1, coords2). +ve and -ve values are assigned respectively, depending on coords1 < coords2 or vice versa. """ if coords[1] > coords[0]: np_array[coords[0]:coords[1]] = assigned_value elif coords[0] > coords[1]: np_array[coords[1]:coords[0]] = -assigned_value return np_array def calc_coord_length(gene_coords): """ Given a coord like [10,20], return the length of the sub-sequence (10). """ return max(gene_coords) - min(gene_coords) def calc_displacement(seq, coords, return_type='all'): """ Displacement: given a chromosomal coordinate, calculate how far it is from the start of the gene. <-------| 0 0 0 0 0 1 1 1 1 1 1 1 0 0 where 1 denotes genic regions | | | | s_coord e_coord "seq" can be NumPy arrays or lists. Coordinates are zero-based (start, end) lists. Make sure individual positions are converted to lists before calling the def! e.g. 4 --> [3, 4] Using tuples allows for checking location of k-mers in a longer n-mer. e.g. "BC" in "ABCDE" --> 2nd dimer out of 4 possible dimers and allows for specifying - strand coordinates: e.g. [4, 3] Resulting value can either be "percentage" (coords / total length of gene) or "absolute" (coords - start of gene) """ # sanity checking inputs assert len(coords) == 2, "make sure 'coords' is a list!" assert return_type in ['all', 'percentage', 'absolute', 'absolute_inverse'], \ "return_type != 'all', 'percentage', 'absolute' or 'absolute_inverse'." if return_type == 'all': return_type = ['percentage', 'absolute', 'absolute_inverse'] # go upstream (scaffold's perspective) first s_coord = min(coords) if s_coord > 0: while seq[s_coord-1] == seq[min(coords)]: s_coord -= 1 if s_coord == 0: break # then downstream (scaffold's perspective) e_coord = max(coords) if e_coord < len(seq): while seq[e_coord] == seq[max(coords)-1]: e_coord += 1 if e_coord == len(seq): break feature_coord = [s_coord, e_coord] # if original start coord > end coord, flip the coords of feature_coord, # so that the 3' end becomes the "head" of the feature and the 5' end # becomes the "tail" of the feature if coords[0] > coords[1]: feature_coord = [-feature_coord[1], -feature_coord[0]] coords = [-coords[0], -coords[1]] returned_list = [] if 'percentage' in return_type: # takes the HALFWAY point within coords, and checks how far it is from # the 5' end # range of values returned: (0, 1) loc_pct = ((coords[0] + coords[1]) / 2 - feature_coord[0]) / \ (calc_coord_length(feature_coord)) returned_list.append(round(loc_pct, 5)) if 'absolute' in return_type: # range of values returned: [0, len(feature_coord)) returned_list.append(coords[0] - feature_coord[0]) if 'absolute_inverse' in return_type: # range of values returned: (-len(feature_coord), -1] returned_list.append(coords[1] - feature_coord[1] - 1) return [str(x) for x in returned_list] def translate_exon_info(ei_integer, gene_id): """ Change integers stored in NumPy array into strings, e.g.: 1 --> Exon_1 6 --> Intron_3 Also return info in the negative form (i.e. Exon_-1 is the last exon, Exon_-2 is penultimate exon...). """ # in rare cases, the longest transcript in a gene is shorter than the # gene itself. Thus, there's a possibility that # gene_info != AND exon_info == 0 !! if not ei_integer: return 'no_info' ei_integer = abs(int(ei_integer)) # NumPy ints aren't... really ints. ei = 'Exon' if ei_integer % 2 else 'Intron' # round() is dangerous. Banker's rounding. ei_number = (ei_integer + 1) // 2 ei_inverse = ei_number - exon_count[gene_id] - (ei_integer % 2) return '_'.join([ei, str(ei_number), str(ei_inverse)]) # read sequences sequence_lengths = parse_fasta.get_all_sequences(args.genome_fasta, 'fasta', lengths_only=True) if args.verbose: print ('[{}] Lengths for {} sequences parsed.'.format( time.asctime(), len(sequence_lengths)), file=sys.stderr) # read coordinates of genes and exons from .gff3 file. scaffold_gff3 = parse_gff3.parse_gff3(args.genome_gff3, 'exon') # as genes might contain overlapping isoforms, the longest isoform is chosen, # if multiples exist. scaffold_gff3 = parse_gff3.pick_longest_mRNA(scaffold_gff3) # make sure features in all mRNAs are sorted properly (for exon numbering). scaffold_gff3 = parse_gff3.sort_features(scaffold_gff3) # genic regions are denoted in a NumPy array as follows: # 0: intergenic region # +ve: gene in the + strand # -ve: gene in the - strand # # gene_names is a dict that converts ints in gene_info back to gene names. # # NOTE: if genes overlap, the first one that gets annotated (sorted # alphabetically) has priority. gene_info = {} gene_names = {} gene_counter = 0 # similar to genic data, exonic data is stored in a NumPy array. However, # exons/introns in all genes have the following numbering scheme: # 0: intergenic region # 1, 2: exon 1, intron 1 # 3, 4: exon 2, intron 2, ... # +ve: gene in the + strand # -ve: gene in the - strand # # exon_count stores # of exons in that gene. exon_info = {} exon_count = {} for s in natural_sort.natural_sort(scaffold_gff3): gene_info[s] = np.zeros(sequence_lengths[s], np.int32) exon_info[s] = np.zeros(sequence_lengths[s], np.int8) for gene_id in natural_sort.natural_sort(scaffold_gff3[s]): # gene detail annotation gene_coords = scaffold_gff3[s][gene_id].coords # jump out of loop if the assigned region is not zeroes ([0, 0, ...]). if is_numpy_array_filled(gene_info[s], gene_coords): continue gene_counter += 1 gene_info[s] = assign_value_to_numpy_array(gene_info[s], gene_coords, gene_counter) gene_names[gene_counter] = gene_id # exon detail annotation # # sole_mRNA is a mRNA object, with the assumption that *.mRNAs # returns a dict containing {mRNA_name: mRNA_object} sole_mRNA = list(scaffold_gff3[s][gene_id].mRNAs.values())[0] exon_coords = sole_mRNA.details['exon'] exon_count[gene_id] = len(exon_coords) unpacked_coords = [y for x in exon_coords for y in x] exon_intron_coords = [(x, y) for x, y in zip(unpacked_coords[:-1], unpacked_coords[1:])] for n, ei in enumerate(exon_intron_coords): exon_info[s] = assign_value_to_numpy_array(exon_info[s], ei, n+1) if args.verbose: print ('[{}] {} sequences with gene annotations parsed.'.format( time.asctime(), len(scaffold_gff3)), file=sys.stderr) print ('[{}] {} genes parsed in total.'.format( time.asctime(), gene_counter), file=sys.stderr) # read per-position # of methylated and unmethylated bases tsv_reader = csv.reader(args.bismark_cov, delimiter='\t') counter = 0 for row in tsv_reader: # skip empty rows if not row: continue # to make script work on annotated bismark covs (i.e. overwrite previous # annotation), retain only first 6 columns row = row[:6] # skip lines with scaffold names not in the gff3 file scaf = row[0] if scaf not in gene_info: print ('\t'.join(row), 'no_info', sep='\t') continue # all systems go! start = int(row[1]) - 1 end = int(row[2]) meth = int(row[4]) unmeth = int(row[5]) # add gene info first extra_info = [] if gene_info[scaf][start] == 0: extra_info.append('intergenic') intergenic_info = calc_displacement(gene_info[scaf], [start, end]) extra_info += intergenic_info # get info regarding regions flanking the intergenic region: # scaf_start: 5' end of the intergenic region is start of scaffold # scaf_end: 3' end of the intergenic region is end of scaffold # upstream_watson: 5' end of intergenic region is 3' end of a gene # upstream_crick: 5' end of intergenic region is 5' end of a gene # downstream_watson: 3' end of intergenic region is 5' end of a gene # downstream_crick: 3' end of intergenic region is 3' end of a gene fivep_dist = int(intergenic_info[1]) threep_dist = abs(int(intergenic_info[2])) if fivep_dist == start: fivep_info = 'scaf_start' elif gene_info[scaf][start - fivep_dist - 1] > 0: gene_id = gene_names[gene_info[scaf][start - fivep_dist - 1]] fivep_info = 'upstream_watson|' + gene_id elif gene_info[scaf][start - fivep_dist - 1] < 0: gene_id = gene_names[abs(gene_info[scaf][start - fivep_dist - 1])] fivep_info = 'upstream_crick|' + gene_id else: # if this is triggered, serious debugging is needed print ("[{}] invalid fivep_info value encountered!".format( time.asctime()), file=sys.stderr) raise SystemExit() if threep_dist + start == sequence_lengths[scaf]: threep_info = 'scaf_end' elif gene_info[scaf][start + threep_dist] > 0: gene_id = gene_names[gene_info[scaf][start + threep_dist]] threep_info = 'downstream_watson|' + gene_id elif gene_info[scaf][start + threep_dist] < 0: gene_id = gene_names[abs(gene_info[scaf][start + threep_dist])] threep_info = 'downstream_crick|' + gene_id else: # if this is triggered, serious debugging is needed print ("[{}] invalid threep_info value encountered!".format( time.asctime()), file=sys.stderr) raise SystemExit() extra_info += [fivep_info, threep_info] else: gene_id = gene_names[abs(gene_info[scaf][start])] extra_info.append(gene_id) if gene_info[scaf][start] > 0: extra_info += calc_displacement(gene_info[scaf], [start, end]) elif gene_info[scaf][start] < 0: extra_info += calc_displacement(gene_info[scaf], [end, start]) # add exon info. ignore intergenic stuff - no exon/intron there. extra_info.append(translate_exon_info(exon_info[scaf][start], gene_id)) if exon_info[scaf][start] > 0: extra_info += calc_displacement(exon_info[scaf], [start, end]) elif exon_info[scaf][start] < 0: extra_info += calc_displacement(exon_info[scaf], [end, start]) # fix unneeded precision in meth %. output = row[:3] + [str(round(meth/(meth+unmeth) * 100, 4))] + row[4:6] print ('\t'.join(output + extra_info)) if args.verbose: counter += 1 if counter % 10000 == 0: print ('[{}] {} methylated positions processed...'.format( time.asctime(), counter), file=sys.stderr)