"""
Adapter finding and trimming classes

The ...Adapter classes are responsible for finding adapters.
The ...Match classes trim the reads.
"""
import logging
from enum import Enum
from collections import defaultdict
from typing import Optional, Tuple, Sequence, Dict, Any, List
from abc import ABC, abstractmethod

from . import align

logger = logging.getLogger()


class Where(Enum):
    """Alignment flag combinations describing where an adapter may occur in a read."""

    # Constants for the Aligner.locate() function.
    # The function is called with SEQ1 as the adapter, SEQ2 as the read.
    # TODO get rid of those constants, use strings instead
    BACK = align.START_WITHIN_SEQ2 | align.STOP_WITHIN_SEQ2 | align.STOP_WITHIN_SEQ1
    FRONT = align.START_WITHIN_SEQ2 | align.STOP_WITHIN_SEQ2 | align.START_WITHIN_SEQ1
    PREFIX = align.STOP_WITHIN_SEQ2
    SUFFIX = align.START_WITHIN_SEQ2
    # Just like FRONT/BACK, but without internal matches
    FRONT_NOT_INTERNAL = align.START_WITHIN_SEQ1 | align.STOP_WITHIN_SEQ2
    BACK_NOT_INTERNAL = align.START_WITHIN_SEQ2 | align.STOP_WITHIN_SEQ1
    ANYWHERE = align.SEMIGLOBAL
    LINKED = 'linked'


def returns_defaultdict_int():
    """Return a new ``defaultdict(int)``; used as a picklable default factory."""
    # We need this function to make EndStatistics picklable.
    # Even a @staticmethod of EndStatistics is not sufficient
    # as that is not picklable before Python 3.5.
    return defaultdict(int)


class EndStatistics:
    """Statistics about the 5' or 3' end"""

    def __init__(self, adapter: "SingleAdapter"):
        self.max_error_rate = adapter.max_error_rate  # type: float
        self.sequence = adapter.sequence  # type: str
        self.effective_length = adapter.effective_length  # type: int
        self.has_wildcards = adapter.adapter_wildcards  # type: bool
        # self.errors[l][e] == n iff n times a sequence of length l matching at e errors was removed
        self.errors = defaultdict(returns_defaultdict_int)  # type: Dict[int, Dict[int, int]]
        self.adjacent_bases = {'A': 0, 'C': 0, 'G': 0, 'T': 0, '': 0}
        # TODO avoid hard-coding the list of classes
        self._remove_prefix = isinstance(
            adapter, (FrontAdapter, NonInternalFrontAdapter, PrefixAdapter)
        )

    def __repr__(self):
        # Convert the nested defaultdicts to plain dicts for a readable representation
        errors = {k: dict(v) for k, v in self.errors.items()}
        return "EndStatistics(max_error_rate={}, errors={}, adjacent_bases={})".format(
            self.max_error_rate,
            errors,
            self.adjacent_bases,
        )

    def __iadd__(self, other: Any):
        """
        Merge the counts of another EndStatistics object into this one.

        Raises ValueError if other is not an EndStatistics and RuntimeError if
        the two objects describe different adapters (error rate, sequence or
        effective length differ).
        """
        if not isinstance(other, self.__class__):
            raise ValueError("Cannot compare")
        if (
            self.max_error_rate != other.max_error_rate
            or self.sequence != other.sequence
            or self.effective_length != other.effective_length
        ):
            raise RuntimeError('Incompatible EndStatistics, cannot be added')
        for base in ('A', 'C', 'G', 'T', ''):
            self.adjacent_bases[base] += other.adjacent_bases[base]
        for length, error_dict in other.errors.items():
            for errors, count in error_dict.items():
                self.errors[length][errors] += count
        return self

    @property
    def lengths(self):
        """Map each removed length to the total number of removals (summed over error counts)."""
        d = {length: sum(errors.values()) for length, errors in self.errors.items()}
        return d

    def random_match_probabilities(self, gc_content: float) -> List[float]:
        """
        Estimate probabilities that this adapter end matches a
        random sequence. Indels are not taken into account.

        Returns a list p, where p[i] is the probability that
        i bases of this adapter match a random sequence with
        GC content gc_content.
        """
        seq = self.sequence
        # FIXME this is broken for 'anywhere' adapters
        if self._remove_prefix:
            # 5' adapters are reversed before the cumulative product is computed
            seq = seq[::-1]
        allowed_bases = 'CGRYSKMBDHVN' if self.has_wildcards else 'GC'
        p = 1.
        probabilities = [p]
        for c in seq:
            if c in allowed_bases:
                p *= gc_content / 2.
            else:
                p *= (1. - gc_content) / 2.
            probabilities.append(p)
        return probabilities


class AdapterStatistics:
    """
    Statistics about an adapter. An adapter can work on the 5' end (front)
    or 3' end (back) of a read, and statistics for that are captured
    separately in EndStatistics objects.
    """

    def __init__(
        self,
        adapter: "Adapter",
        front: "SingleAdapter",
        back: Optional["SingleAdapter"] = None,
    ):
        self.name = adapter.name
        self.adapter = adapter
        self.front = EndStatistics(front)
        # Without a separate back adapter, the back statistics are also built from `front`
        if back is None:
            self.back = EndStatistics(front)
        else:
            self.back = EndStatistics(back)
        self.reverse_complemented = 0

    def __repr__(self):
        return "AdapterStatistics(name={}, front={}, back={})".format(
            self.name,
            self.front,
            self.back,
        )

    def __iadd__(self, other: "AdapterStatistics"):
        """Merge the front/back statistics and counters of other into this object."""
        self.front += other.front
        self.back += other.back
        self.reverse_complemented += other.reverse_complemented
        return self


class Match(ABC):
    """Abstract interface shared by all match objects."""

    @abstractmethod
    def remainder_interval(self) -> Tuple[int, int]:
        pass

    @abstractmethod
    def get_info_records(self, read) -> List[List]:
        pass

    @abstractmethod
    def trimmed(self, read):
        pass


class SingleMatch(Match, ABC):
    """
    Representation of a single adapter matched to a single string
    """
    __slots__ = ['astart', 'astop', 'rstart', 'rstop', 'matches', 'errors',
        'adapter', 'sequence', 'length', 'adjacent_base']

    def __init__(
        self,
        astart: int,
        astop: int,
        rstart: int,
        rstop: int,
        matches: int,
        errors: int,
        adapter: "SingleAdapter",
        sequence: str,
    ):
        self.adjacent_base = ""
        self.astart = astart  # type: int
        self.astop = astop  # type: int
        self.rstart = rstart  # type: int
        self.rstop = rstop  # type: int
        self.matches = matches  # type: int
        self.errors = errors  # type: int
        self.adapter = adapter  # type: SingleAdapter
        self.sequence = sequence
        # Number of aligned characters in the adapter. If there are
        # indels, this may be different from the number of characters
        # in the read.
        self.length = astop - astart  # type: int

    def __repr__(self):
        return 'SingleMatch(astart={}, astop={}, rstart={}, rstop={}, matches={}, errors={})'.format(
            self.astart, self.astop, self.rstart, self.rstop, self.matches, self.errors)

    def wildcards(self, wildcard_char: str = "N") -> str:
        """
        Return a string that contains, for each wildcard character,
        the character that it matches. For example, if the adapter
        ATNGNA matches ATCGTA, then the string 'CT' is returned.

        If there are indels, this is not reliable as the full alignment
        is not available.
        """
        wildcards = [
            self.sequence[self.rstart + i]
            for i in range(self.length)
            if self.adapter.sequence[self.astart + i] == wildcard_char
            and self.rstart + i < len(self.sequence)
        ]
        return ''.join(wildcards)

    def get_info_records(self, read) -> List[List]:
        """
        Return a one-element list containing the info record for this match.

        The record holds, in order: an empty placeholder, the error count,
        rstart, rstop, the read sequence before/within/after the match, the
        adapter name and the qualities before/within/after the match (three
        empty strings if the read has no qualities).
        """
        seq = read.sequence
        qualities = read.qualities
        info = [
            "",
            self.errors,
            self.rstart,
            self.rstop,
            seq[0:self.rstart],
            seq[self.rstart:self.rstop],
            seq[self.rstop:],
            self.adapter.name,
        ]
        if qualities:
            info += [
                qualities[0:self.rstart],
                qualities[self.rstart:self.rstop],
                qualities[self.rstop:],
            ]
        else:
            info += ["", "", ""]

        return [info]


class RemoveBeforeMatch(SingleMatch):
    """A match that removes sequence before the match"""

    def __repr__(self):
        return 'RemoveBeforeMatch(astart={}, astop={}, rstart={}, rstop={}, matches={}, errors={})'.format(
            self.astart, self.astop, self.rstart, self.rstop, self.matches, self.errors)

    def rest(self) -> str:
        """
        Return the part of the read before this match.
        This can be an empty string.
        """
        return self.sequence[:self.rstart]

    def remainder_interval(self) -> Tuple[int, int]:
        """
        Return an interval (start, stop) that describes the part of the read that would
        remain after trimming
        """
        return self.rstop, len(self.sequence)

    def trim_slice(self):
        # Same as remainder_interval, but as a slice() object
        return slice(self.rstop, None)

    def trimmed(self, read):
        """Return the read with everything up to the end of the match removed."""
        return read[self.rstop:]

    def update_statistics(self, statistics: AdapterStatistics):
        """Update AdapterStatistics in place"""
        statistics.front.errors[self.rstop][self.errors] += 1


class RemoveAfterMatch(SingleMatch):
    """A match that removes sequence after the match"""

    def __repr__(self):
        return "RemoveAfterMatch(astart={}, astop={}, rstart={}, rstop={}, matches={}, errors={})".format(
            self.astart, self.astop, self.rstart, self.rstop, self.matches, self.errors)

    def rest(self) -> str:
        """
        Return the part of the read after this match.
        This can be an empty string.
        """
        return self.sequence[self.rstop:]

    def remainder_interval(self) -> Tuple[int, int]:
        """
        Return an interval (start, stop) that describes the part of the read that would
        remain after trimming
        """
        return 0, self.rstart

    def trim_slice(self):
        # Same as remainder_interval, but as a slice() object
        return slice(None, self.rstart)

    def trimmed(self, read):
        """Return the read with the match and everything after it removed."""
        return read[:self.rstart]

    def update_statistics(self, statistics: AdapterStatistics):
        """Update AdapterStatistics in place"""
        # Base directly preceding the match; the empty string if the match starts at position 0
        adjacent_base = self.sequence[self.rstart - 1:self.rstart]
        statistics.back.errors[len(self.sequence) - self.rstart][self.errors] += 1
        try:
            statistics.back.adjacent_bases[adjacent_base] += 1
        except KeyError:
            # Bases other than A, C, G, T are counted under the '' key.
            # This must increment; assigning 1 would discard earlier counts.
            statistics.back.adjacent_bases[''] += 1


def _generate_adapter_name(_start=[1]) -> str:
    """Return the next adapter name from a process-wide counter ('1', '2', ...)."""
    # The mutable default argument is intentional: it holds the counter state between calls.
    name = str(_start[0])
    _start[0] += 1
    return name


class Adapter(ABC):
    """Abstract base class of all adapters."""

    description = "adapter with one component"  # this is overridden in subclasses

    def __init__(self, name: Optional[str], *args, **kwargs):
        self.name = name

    @abstractmethod
    def enable_debug(self):
        pass

    @abstractmethod
    def match_to(self, sequence: str):
        pass


class SingleAdapter(Adapter, ABC):
    """
    This class can find a single adapter characterized by sequence, error rate,
    type etc. within reads.

    Where in the read the adapter is allowed to appear is determined by the
    subclass through its _aligner() implementation.

    sequence -- The adapter sequence as string. Will be converted to uppercase.
        Also, Us will be converted to Ts. Must not be empty.

    max_error_rate -- Maximum allowed error rate. The error rate is
        the number of errors in the alignment divided by the length
        of the part of the alignment that matches the adapter.

    min_overlap -- Minimum length of the part of the alignment
        that matches the adapter. Values larger than the adapter length are
        reduced to the adapter length.

    read_wildcards -- Whether IUPAC wildcards in the read are allowed.

    adapter_wildcards -- Whether IUPAC wildcards in the adapter are
        allowed.

    name -- optional name of the adapter. If not provided, the name is set to a
        unique number.

    indels -- Whether insertions and deletions are allowed in the alignment.
    """

    def __init__(
        self,
        sequence: str,
        max_error_rate: float = 0.1,
        min_overlap: int = 3,
        read_wildcards: bool = False,
        adapter_wildcards: bool = True,
        name: Optional[str] = None,
        indels: bool = True,
    ):
        super().__init__(name)
        self._debug = False  # type: bool
        self.name = _generate_adapter_name() if name is None else name  # type: str
        self.sequence = sequence.upper().replace("U", "T")  # type: str
        if not self.sequence:
            raise ValueError("Adapter sequence is empty")
        self.max_error_rate = max_error_rate  # type: float
        self.min_overlap = min(min_overlap, len(self.sequence))  # type: int
        iupac = frozenset('XACGTURYSWKMBDHVN')
        if adapter_wildcards and not set(self.sequence) <= iupac:
            # Report the first offending character
            for c in self.sequence:
                if c not in iupac:
                    raise ValueError('Character {!r} in adapter sequence {!r} is '
                        'not a valid IUPAC code. Use only characters '
                        'XACGTURYSWKMBDHVN.'.format(c, self.sequence))
        # Optimization: Use non-wildcard matching if only ACGT is used
        self.adapter_wildcards = adapter_wildcards and not set(self.sequence) <= set("ACGT")  # type: bool
        self.read_wildcards = read_wildcards  # type: bool
        self.indels = indels  # type: bool
        self.aligner = self._aligner()

    def _make_aligner(self, flags):
        """Create an align.Aligner for this adapter using the given alignment flags."""
        # TODO
        # Indels are suppressed by setting their cost very high, but a different algorithm
        # should be used instead.
        indel_cost = 1 if self.indels else 100000
        return align.Aligner(
            self.sequence,
            self.max_error_rate,
            flags=flags,
            wildcard_ref=self.adapter_wildcards,
            wildcard_query=self.read_wildcards,
            indel_cost=indel_cost,
            min_overlap=self.min_overlap,
        )

    def __repr__(self):
        return '<{cls}(name={name!r}, sequence={sequence!r}, '\
            'max_error_rate={max_error_rate}, min_overlap={min_overlap}, '\
            'read_wildcards={read_wildcards}, '\
            'adapter_wildcards={adapter_wildcards}, '\
            'indels={indels})>'.format(cls=self.__class__.__name__, **vars(self))

    @property
    def is_anchored(self):
        """Return whether this adapter is anchored"""
        # SingleAdapter itself never sets a `where` attribute, so relying on it
        # unconditionally raises AttributeError. Honor it if something provides
        # it, otherwise decide by adapter type.
        where = getattr(self, "where", None)
        if where is not None:
            return where in {Where.PREFIX, Where.SUFFIX}
        return isinstance(self, (PrefixAdapter, SuffixAdapter))

    @property
    def effective_length(self):
        return self.aligner.effective_length

    def enable_debug(self):
        """
        Print out the dynamic programming matrix after matching a read to an
        adapter.
        """
        self._debug = True
        self.aligner.enable_debug()

    @abstractmethod
    def _aligner(self):
        pass

    @abstractmethod
    def match_to(self, sequence: str):
        """
        Attempt to match this adapter to the given string.

        Return a Match instance if a match was found;
        return None if no match was found given the matching criteria (minimum
        overlap length, maximum error rate).
        """

    def __len__(self):
        return len(self.sequence)

    def create_statistics(self):
        return AdapterStatistics(self, self)


class FrontAdapter(SingleAdapter):
    """A 5' adapter"""

    description = "regular 5'"

    def __init__(self, *args, **kwargs):
        # force_anywhere must be removed before the remaining arguments reach SingleAdapter
        self._force_anywhere = kwargs.pop("force_anywhere", False)
        super().__init__(*args, **kwargs)

    def _aligner(self):
        return self._make_aligner(Where.ANYWHERE.value if self._force_anywhere else Where.FRONT.value)

    def match_to(self, sequence: str):
        """
        Attempt to match this adapter to the given read.

        Return a Match instance if a match was found;
        return None if no match was found given the matching criteria (minimum
        overlap length, maximum error rate).
        """
        alignment = self.aligner.locate(sequence.upper())  # type: Optional[Tuple[int,int,int,int,int,int]]
        if self._debug:
            print(self.aligner.dpmatrix)  # pragma: no cover
        if alignment is None:
            return None
        return RemoveBeforeMatch(*alignment, adapter=self, sequence=sequence)


class BackAdapter(SingleAdapter):
    """A 3' adapter"""

    description = "regular 3'"

    def __init__(self, *args, **kwargs):
        # force_anywhere must be removed before the remaining arguments reach SingleAdapter
        self._force_anywhere = kwargs.pop("force_anywhere", False)
        super().__init__(*args, **kwargs)

    def _aligner(self):
        return self._make_aligner(Where.ANYWHERE.value if self._force_anywhere else Where.BACK.value)

    def match_to(self, sequence: str):
        """
        Attempt to match this adapter to the given read.

        Return a Match instance if a match was found;
        return None if no match was found given the matching criteria (minimum
        overlap length, maximum error rate).
        """
        alignment = self.aligner.locate(sequence.upper())  # type: Optional[Tuple[int,int,int,int,int,int]]
        if self._debug:
            print(self.aligner.dpmatrix)  # pragma: no cover
        if alignment is None:
            return None
        return RemoveAfterMatch(*alignment, adapter=self, sequence=sequence)


class AnywhereAdapter(SingleAdapter):
    """
    An adapter that can be 5' or 3'. If a match involves the first base of
    the read, it is assumed to be a 5' adapter and a 3' otherwise.
    """

    description = "variable 5'/3'"

    def _aligner(self):
        return self._make_aligner(Where.ANYWHERE.value)

    def match_to(self, sequence: str):
        """
        Attempt to match this adapter to the given string.

        Return a Match instance if a match was found;
        return None if no match was found given the matching criteria (minimum
        overlap length, maximum error rate).
        """
        alignment = self.aligner.locate(sequence)
        if self._debug:
            print(self.aligner.dpmatrix)  # pragma: no cover
        if alignment is None:
            return None
        # guess: if alignment starts at pos 0, it's a 5' adapter
        if alignment[2] == 0:  # index 2 is rstart
            match = RemoveBeforeMatch(*alignment, adapter=self, sequence=sequence)  # type: ignore
        else:
            match = RemoveAfterMatch(*alignment, adapter=self, sequence=sequence)  # type: ignore
        return match


class NonInternalFrontAdapter(FrontAdapter):
    """A non-internal 5' adapter"""

    description = "non-internal 5'"

    def _aligner(self):
        return self._make_aligner(Where.FRONT_NOT_INTERNAL.value)

    def match_to(self, sequence: str):
        """
        Attempt to match this adapter to the given read.

        Return a RemoveBeforeMatch if a match was found, None otherwise.
        """
        if not self.adapter_wildcards and sequence.startswith(self.sequence):
            # Exact match found; no alignment necessary
            # astart, astop, rstart, rstop, matches, errors
            n = len(self.sequence)
            return RemoveBeforeMatch(
                0, n, 0, n, n, 0, adapter=self, sequence=sequence
            )  # type: ignore
        alignment = self.aligner.locate(sequence)
        if self._debug:
            # Some aligners may not expose a dpmatrix attribute; skip printing then
            try:
                print(self.aligner.dpmatrix)  # pragma: no cover
            except AttributeError:
                pass
        if alignment is None:
            return None
        return RemoveBeforeMatch(*alignment, adapter=self, sequence=sequence)  # type: ignore


class NonInternalBackAdapter(BackAdapter):
    """A non-internal 3' adapter"""

    description = "non-internal 3'"

    def _aligner(self):
        return self._make_aligner(Where.BACK_NOT_INTERNAL.value)

    def match_to(self, sequence: str):
        """
        Attempt to match this adapter to the given read.

        Return a RemoveAfterMatch if a match was found, None otherwise.
        """
        if not self.adapter_wildcards and sequence.endswith(self.sequence):
            # Exact match found
            # astart, astop, rstart, rstop, matches, errors
            n = len(self.sequence)
            return RemoveAfterMatch(
                0, n, len(sequence) - n, len(sequence), n, 0, adapter=self, sequence=sequence
            )  # type: ignore
        alignment = self.aligner.locate(sequence)
        if self._debug:
            # Some aligners may not expose a dpmatrix attribute; skip printing then
            try:
                print(self.aligner.dpmatrix)  # pragma: no cover
            except AttributeError:
                pass
        if alignment is None:
            return None
        return RemoveAfterMatch(*alignment, adapter=self, sequence=sequence)  # type: ignore


class PrefixAdapter(NonInternalFrontAdapter):
    """An anchored 5' adapter"""

    description = "anchored 5'"

    def _aligner(self):
        if not self.indels:  # TODO or if error rate allows 0 errors anyway
            return align.PrefixComparer(
                self.sequence,
                self.max_error_rate,
                wildcard_ref=self.adapter_wildcards,
                wildcard_query=self.read_wildcards,
                min_overlap=self.min_overlap
            )
        else:
            return self._make_aligner(Where.PREFIX.value)


class SuffixAdapter(NonInternalBackAdapter):
    """An anchored 3' adapter"""

    description = "anchored 3'"

    def _aligner(self):
        if not self.indels:  # TODO or if error rate allows 0 errors anyway
            return align.SuffixComparer(
                self.sequence,
                self.max_error_rate,
                wildcard_ref=self.adapter_wildcards,
                wildcard_query=self.read_wildcards,
                min_overlap=self.min_overlap
            )
        else:
            return self._make_aligner(Where.SUFFIX.value)


class LinkedMatch(Match):
    """
    Represent a match of a LinkedAdapter
    """

    def __init__(self, front_match: RemoveBeforeMatch, back_match: RemoveAfterMatch, adapter: "LinkedAdapter"):
        # Either of the two matches may be None, but not both
        assert front_match is not None or back_match is not None
        self.front_match = front_match  # type: RemoveBeforeMatch
        self.back_match = back_match  # type: RemoveAfterMatch
        self.adapter = adapter  # type: LinkedAdapter

    def __repr__(self):
        # The format string must contain the placeholders; an empty format
        # string would always yield an empty representation.
        return '<LinkedMatch(front_match={!r}, back_match={!r}, adapter={!r})>'.format(
            self.front_match, self.back_match, self.adapter)

    @property
    def matches(self):
        """Number of matching bases"""
        m = 0
        if self.front_match is not None:
            m += self.front_match.matches
        if self.back_match is not None:
            m += self.back_match.matches
        return m

    @property
    def errors(self):
        """Number of errors, summed over the front and back match"""
        e = 0
        if self.front_match is not None:
            e += self.front_match.errors
        if self.back_match is not None:
            e += self.back_match.errors
        return e

    def trimmed(self, read):
        """Return the read after applying the front and then the back match."""
        if self.front_match:
            read = self.front_match.trimmed(read)
        if self.back_match:
            read = self.back_match.trimmed(read)
        return read

    @property
    def adjacent_base(self):
        # NOTE(review): back_match may be None (only one of the two matches is
        # required), in which case this raises AttributeError - confirm callers.
        return self.back_match.adjacent_base

    def update_statistics(self, statistics):
        """Update AdapterStatistics in place"""
        if self.front_match:
            statistics.front.errors[self.front_match.rstop][self.front_match.errors] += 1
        if self.back_match:
            length = len(self.back_match.sequence) - self.back_match.rstart
            statistics.back.errors[length][self.back_match.errors] += 1

    def remainder_interval(self) -> Tuple[int, int]:
        matches = [match for match in [self.front_match, self.back_match] if match is not None]
        return remainder(matches)

    def get_info_records(self, read) -> List[List]:
        """
        Return one info record per existing match (front first, then back).

        The adapter name in each record is replaced by the linked adapter's
        name ("none" if it has no name) followed by ";1" for the front and
        ";2" for the back match.
        """
        records = []
        for match, namesuffix in [
            (self.front_match, ";1"),
            (self.back_match, ";2"),
        ]:
            if match is None:
                continue
            record = match.get_info_records(read)[0]
            record[7] = ("none" if self.adapter.name is None else self.adapter.name) + namesuffix
            records.append(record)
            # The back match coordinates refer to the read with the front match removed
            read = match.trimmed(read)
        return records


class LinkedAdapter(Adapter):
    """A 5' adapter combined with a 3' adapter"""

    description = "linked"

    def __init__(
        self,
        front_adapter,
        back_adapter,
        front_required,
        back_required,
        name,
    ):
        super().__init__(name)
        self.front_required = front_required
        self.back_required = back_required

        # The following attributes are needed for the report
        self.where = Where.LINKED
        self.name = _generate_adapter_name() if name is None else name
        self.front_adapter = front_adapter
        self.front_adapter.name = self.name
        self.back_adapter = back_adapter

    def enable_debug(self):
        self.front_adapter.enable_debug()
        self.back_adapter.enable_debug()

    def match_to(self, sequence: str) -> Optional[LinkedMatch]:
        """
        Match the two linked adapters against a string

        Return None if a required adapter was not found or if neither adapter
        was found.
        """
        front_match = self.front_adapter.match_to(sequence)
        if self.front_required and front_match is None:
            return None

        if front_match is not None:
            # The back adapter is searched in what remains after front trimming
            sequence = sequence[front_match.trim_slice()]
        back_match = self.back_adapter.match_to(sequence)
        if back_match is None and (self.back_required or front_match is None):
            return None
        return LinkedMatch(front_match, back_match, self)

    def create_statistics(self):
        return AdapterStatistics(self, self.front_adapter, self.back_adapter)

    @property
    def sequence(self):
        return self.front_adapter.sequence + "..." + self.back_adapter.sequence

    @property
    def remove(self):
        return None


class MultiAdapter(Adapter, ABC):
    """
    Represent multiple adapters of the same type at once and use an index data structure
    to speed up matching. This acts like a "normal" Adapter as it provides a match_to
    method, but is faster with lots of adapters.

    There are quite a few restrictions:
    - no indels are allowed
    - the error rate allows at most 2 mismatches
    - wildcards in the adapter are not allowed
    - wildcards in the read are not allowed

    Use the is_acceptable() method to check individual adapters.
    """
    MultiAdapterIndex = Dict[str, Tuple[SingleAdapter, int, int]]

    def __init__(self, adapters):
        """All given adapters must be of the same type"""
        super().__init__(name="multi_adapter")
        if not adapters:
            raise ValueError("Adapter list is empty")
        for adapter in adapters:
            self._accept(adapter)
        self._adapters = adapters
        self._lengths, self._index = self._make_index()
        self._make_affix = self._get_make_affix()

    def __repr__(self):
        return "MultiAdapter(adapters={!r})".format(self._adapters)

    @abstractmethod
    def _get_make_affix(self):
        pass

    @abstractmethod
    def _make_match(self, adapter, length, matches, errors, sequence) -> SingleMatch:
        pass

    @classmethod
    def _accept(cls, adapter):
        """Raise a ValueError if the adapter is not acceptable"""
        if adapter.read_wildcards:
            raise ValueError("Wildcards in the read not supported")
        if adapter.adapter_wildcards:
            raise ValueError("Wildcards in the adapter not supported")
        # Maximum number of errors allowed over the full adapter length
        k = int(len(adapter) * adapter.max_error_rate)
        if k > 0 and adapter.indels:
            raise ValueError("Indels not allowed")
        if k > 2:
            raise ValueError("Error rate too high")

    @classmethod
    def is_acceptable(cls, adapter):
        """
        Return whether this adapter is acceptable for being used by MultiAdapter

        Adapters are not acceptable if they allow wildcards, allow too many errors,
        or would lead to a very large index.
        """
        try:
            cls._accept(adapter)
        except ValueError:
            return False
        return True

    def _make_index(self) -> Tuple[List[int], "MultiAdapterIndex"]:
        """
        Build the lookup index for all adapters.

        Return a tuple (lengths, index): index maps each string yielded by
        align.hamming_environment() for an adapter to a tuple
        (adapter, errors, matches); lengths is the list of distinct lengths of
        those strings in descending order.
        """
        logger.info('Building index of %s adapters ...', len(self._adapters))
        index = dict()  # type: MultiAdapter.MultiAdapterIndex
        lengths = set()
        has_warned = False
        for adapter in self._adapters:
            sequence = adapter.sequence
            k = int(adapter.max_error_rate * len(sequence))
            for s, errors, matches in align.hamming_environment(sequence, k):
                if s in index:
                    other_adapter, other_errors, other_matches = index[s]
                    if matches < other_matches:
                        continue
                    # NOTE(review): an existing entry is never replaced, not even when
                    # the current adapter has more matches - confirm this is intended.
                    if other_matches == matches and not has_warned:
                        logger.warning(
                            "Adapters %s %r and %s %r are very similar. At %s allowed errors, "
                            "the sequence %r cannot be assigned uniquely because the number of "
                            "matches is %s compared to both adapters.",
                            other_adapter.name, other_adapter.sequence, adapter.name,
                            adapter.sequence, k, s, matches
                        )
                        has_warned = True
                else:
                    index[s] = (adapter, errors, matches)
                lengths.add(len(s))
        logger.info('Built an index containing %s strings.', len(index))

        return sorted(lengths, reverse=True), index

    def match_to(self, sequence: str):
        """
        Match the adapters against a string and return a Match that represents
        the best match or None if no match was found
        """
        # Check all the prefixes or suffixes (affixes) that could match
        best_adapter = None  # type: Optional[SingleAdapter]
        best_length = 0
        best_m = -1
        best_e = 1000
        for length in self._lengths:
            if length < best_m:
                # No chance of getting the same or a higher number of matches, so we can stop early
                break
            affix = self._make_affix(sequence, length)
            try:
                adapter, e, m = self._index[affix]
            except KeyError:
                continue
            # Prefer more matches; break ties by fewer errors
            if m > best_m or (m == best_m and e < best_e):
                best_adapter = adapter
                best_e = e
                best_m = m
                best_length = length

        if best_m == -1:
            return None
        else:
            assert best_adapter is not None
            return self._make_match(best_adapter, best_length, best_m, best_e, sequence)

    def enable_debug(self):
        pass


class MultiPrefixAdapter(MultiAdapter):
    """A MultiAdapter that only accepts anchored 5' adapters (PrefixAdapter)."""

    @classmethod
    def _accept(cls, adapter):
        if not isinstance(adapter, PrefixAdapter):
            raise ValueError("Only 5' anchored adapters are allowed")
        return super()._accept(adapter)

    def _make_match(self, adapter, length, matches, errors, sequence):
        return RemoveBeforeMatch(
            astart=0,
            astop=len(adapter.sequence),
            rstart=0,
            rstop=length,
            matches=matches,
            errors=errors,
            adapter=adapter,
            sequence=sequence,
        )

    def _get_make_affix(self):
        return self._make_prefix

    @staticmethod
    def _make_prefix(s, n):
        return s[:n]


class MultiSuffixAdapter(MultiAdapter):
    """A MultiAdapter that only accepts anchored 3' adapters (SuffixAdapter)."""

    @classmethod
    def _accept(cls, adapter):
        if not isinstance(adapter, SuffixAdapter):
            raise ValueError("Only anchored 3' adapters are allowed")
        return super()._accept(adapter)

    def _make_match(self, adapter, length, matches, errors, sequence):
        return RemoveAfterMatch(
            astart=0,
            astop=len(adapter.sequence),
            rstart=len(sequence) - length,
            rstop=len(sequence),
            matches=matches,
            errors=errors,
            adapter=adapter,
            sequence=sequence,
        )

    def _get_make_affix(self):
        return self._make_suffix

    @staticmethod
    def _make_suffix(s, n):
        return s[-n:]


def warn_duplicate_adapters(adapters):
    """Log a warning for every adapter whose (class, sequence) pair was already seen."""
    seen = set()
    for adapter in adapters:
        key = (adapter.__class__, adapter.sequence)
        if key in seen:
            logger.warning("Adapter %r (%s) was specified multiple times! "
                "Please make sure that this is what you want.",
                adapter.sequence, adapter.description)
        seen.add(key)


def remainder(matches: Sequence[Match]) -> Tuple[int, int]:
    """
    Determine which section of the read would not be trimmed. Return a tuple (start, stop)
    that gives the interval of the untrimmed part relative to the original read.

    matches must be non-empty
    """
    if not matches:
        raise ValueError("matches must not be empty")
    start = 0
    for match in matches:
        # Each interval is relative to the read as left by the preceding matches,
        # so the start offsets accumulate; the length is that of the last interval.
        match_start, match_stop = match.remainder_interval()
        start += match_start
        length = match_stop - match_start
    return (start, start + length)