# cython: profile=False, emit_code_comments=False, language_level=3 """ Quality trimming. """ cdef extern from *: unsigned char * PyUnicode_1BYTE_DATA(object o) int PyUnicode_KIND(object o) int PyUnicode_1BYTE_KIND cdef class HasNoQualities(Exception): pass def quality_trim_index(str qualities, int cutoff_front, int cutoff_back, int base=33): """ Find the positions at which to trim low-quality ends from a nucleotide sequence. Return tuple (start, stop) that indicates the good-quality segment. Qualities are assumed to be ASCII-encoded as chr(qual + base). The algorithm is the same as the one used by BWA within the function 'bwa_trim_read': - Subtract the cutoff value from all qualities. - Compute partial sums from all indices to the end of the sequence. - Trim sequence at the index at which the sum is minimal. """ if qualities is None: raise HasNoQualities("Cannot do quality trimming when no qualities are available") cdef: int s int max_qual int n = len(qualities) int stop = n int start = 0 int i char* qual if not PyUnicode_KIND(qualities) == PyUnicode_1BYTE_KIND: raise ValueError("Quality data is not ASCII") qual = PyUnicode_1BYTE_DATA(qualities) # find trim position for 5' end s = 0 max_qual = 0 for i in range(n): s += cutoff_front - (qual[i] - base) if s < 0: break if s > max_qual: max_qual = s start = i + 1 # same for 3' end max_qual = 0 s = 0 for i in reversed(range(n)): s += cutoff_back - (qual[i] - base) if s < 0: break if s > max_qual: max_qual = s stop = i if start >= stop: start, stop = 0, 0 return (start, stop) def nextseq_trim_index(sequence, int cutoff, int base=33): """ Variant of the above quality trimming routine that works on NextSeq data. With Illumina NextSeq, bases are encoded with two colors. 'No color' (a dark cycle) usually means that a 'G' was sequenced, but that also occurs when sequencing falls off the end of the fragment. The read then contains a run of high-quality G bases in the end. This routine works as the one above, but counts qualities belonging to 'G' bases as being equal to cutoff - 1. """ bases = sequence.sequence qualities = sequence.qualities if qualities is None: raise HasNoQualities() cdef: int s = 0 int max_qual = 0 int max_i int i, q char* qual if not PyUnicode_KIND(qualities) == PyUnicode_1BYTE_KIND: raise ValueError("Quality data is not ASCII") qual = PyUnicode_1BYTE_DATA(qualities) s = 0 max_qual = 0 max_i = len(qualities) for i in reversed(range(max_i)): q = qual[i] - base if bases[i] == 'G': q = cutoff - 1 s += cutoff - q if s < 0: break if s > max_qual: max_qual = s max_i = i return max_i def expected_errors(str qualities, int base=33): """ Return the number of expected errors (as double) from a read’s qualities. This uses the formula in Edgar et al. (2015), see Section 2.2 in . qualities -- ASCII-encoded qualities (chr(qual + base)) """ cdef: int i, q bytes quals = qualities.encode() char* cq = quals double e = 0.0 for i in range(len(qualities)): q = cq[i] - base e += 10 ** (-q / 10) return e