from functools import lru_cache from typing import Optional, List from charset_normalizer.constant import UNICODE_SECONDARY_RANGE_KEYWORD from charset_normalizer.utils import is_punctuation, is_symbol, unicode_range, is_accentuated, is_latin, \ remove_accent, is_separator, is_cjk, is_case_variable, is_hangul, is_katakana, is_hiragana, is_ascii, is_thai class MessDetectorPlugin: """ Base abstract class used for mess detection plugins. All detectors MUST extend and implement given methods. """ def eligible(self, character: str) -> bool: """ Determine if given character should be fed in. """ raise NotImplementedError # pragma: nocover def feed(self, character: str) -> None: """ The main routine to be executed upon character. Insert the logic in witch the text would be considered chaotic. """ raise NotImplementedError # pragma: nocover def reset(self) -> None: """ Permit to reset the plugin to the initial state. """ raise NotImplementedError # pragma: nocover @property def ratio(self) -> float: """ Compute the chaos ratio based on what your feed() has seen. Must NOT be lower than 0.; No restriction gt 0. """ raise NotImplementedError # pragma: nocover class TooManySymbolOrPunctuationPlugin(MessDetectorPlugin): def __init__(self): self._punctuation_count = 0 # type: int self._symbol_count = 0 # type: int self._character_count = 0 # type: int self._last_printable_char = None # type: Optional[str] self._frenzy_symbol_in_word = False # type: bool def eligible(self, character: str) -> bool: return character.isprintable() def feed(self, character: str) -> None: self._character_count += 1 if character != self._last_printable_char and character not in ["<", ">", "=", ":", "/", "&", ";", "{", "}", "[", "]", ",", "|", '"']: if is_punctuation(character): self._punctuation_count += 1 elif character.isdigit() is False and is_symbol(character): self._symbol_count += 2 self._last_printable_char = character def reset(self) -> None: self._punctuation_count = 0 self._character_count = 0 self._symbol_count = 0 @property def ratio(self) -> float: if self._character_count == 0: return 0. ratio_of_punctuation = (self._punctuation_count + self._symbol_count) / self._character_count # type: float return ratio_of_punctuation if ratio_of_punctuation >= 0.3 else 0. class TooManyAccentuatedPlugin(MessDetectorPlugin): def __init__(self): self._character_count = 0 # type: int self._accentuated_count = 0 # type: int def eligible(self, character: str) -> bool: return character.isalpha() def feed(self, character: str) -> None: self._character_count += 1 if is_accentuated(character): self._accentuated_count += 1 def reset(self) -> None: self._character_count = 0 self._accentuated_count = 0 @property def ratio(self) -> float: if self._character_count == 0: return 0. ratio_of_accentuation = self._accentuated_count / self._character_count # type: float return ratio_of_accentuation if ratio_of_accentuation >= 0.35 else 0. class UnprintablePlugin(MessDetectorPlugin): def __init__(self): self._unprintable_count = 0 # type: int self._character_count = 0 # type: int def eligible(self, character: str) -> bool: return True def feed(self, character: str) -> None: if character not in {'\n', '\t', '\r', '\v'} and character.isprintable() is False: self._unprintable_count += 1 self._character_count += 1 def reset(self) -> None: self._unprintable_count = 0 @property def ratio(self) -> float: if self._character_count == 0: return 0. return (self._unprintable_count * 8) / self._character_count class SuspiciousDuplicateAccentPlugin(MessDetectorPlugin): def __init__(self): self._successive_count = 0 # type: int self._character_count = 0 # type: int self._last_latin_character = None # type: Optional[str] def eligible(self, character: str) -> bool: return character.isalpha() and is_latin(character) def feed(self, character: str) -> None: self._character_count += 1 if self._last_latin_character is not None: if is_accentuated(character) and is_accentuated(self._last_latin_character): if character.isupper() and self._last_latin_character.isupper(): self._successive_count += 1 # Worse if its the same char duplicated with different accent. if remove_accent(character) == remove_accent(self._last_latin_character): self._successive_count += 1 self._last_latin_character = character def reset(self) -> None: self._successive_count = 0 self._character_count = 0 self._last_latin_character = None @property def ratio(self) -> float: if self._character_count == 0: return 0. return (self._successive_count * 2) / self._character_count class SuspiciousRange(MessDetectorPlugin): def __init__(self): self._suspicious_successive_range_count = 0 # type: int self._character_count = 0 # type: int self._last_printable_seen = None # type: Optional[str] def eligible(self, character: str) -> bool: return character.isprintable() def feed(self, character: str) -> None: self._character_count += 1 if character.isspace() or is_punctuation(character): self._last_printable_seen = None return if self._last_printable_seen is None: self._last_printable_seen = character return unicode_range_a = unicode_range(self._last_printable_seen) # type: Optional[str] unicode_range_b = unicode_range(character) # type: Optional[str] if is_suspiciously_successive_range(unicode_range_a, unicode_range_b): self._suspicious_successive_range_count += 1 self._last_printable_seen = character def reset(self) -> None: self._character_count = 0 self._suspicious_successive_range_count = 0 self._last_printable_seen = None @property def ratio(self) -> float: if self._character_count == 0: return 0. ratio_of_suspicious_range_usage = (self._suspicious_successive_range_count * 2) / self._character_count # type: float if ratio_of_suspicious_range_usage < 0.1: return 0. return ratio_of_suspicious_range_usage class SuperWeirdWordPlugin(MessDetectorPlugin): def __init__(self): self._word_count = 0 # type: int self._bad_word_count = 0 # type: int self._is_current_word_bad = False # type: bool self._foreign_long_watch = False # type: bool self._character_count = 0 # type: int self._bad_character_count = 0 # type: int self._buffer = "" # type: str self._buffer_accent_count = 0 # type: int def eligible(self, character: str) -> bool: return True def feed(self, character: str) -> None: if character.isalpha(): self._buffer = "".join([self._buffer, character]) if is_accentuated(character): self._buffer_accent_count += 1 if self._foreign_long_watch is False and is_latin(character) is False and is_cjk(character) is False and is_hangul(character) is False and is_katakana(character) is False and is_hiragana(character) is False and is_thai(character) is False: self._foreign_long_watch = True return if not self._buffer: return if (character.isspace() or is_punctuation(character) or is_separator(character)) and self._buffer: self._word_count += 1 buffer_length = len(self._buffer) # type: int self._character_count += buffer_length if buffer_length >= 4 and self._buffer_accent_count / buffer_length >= 0.3: self._is_current_word_bad = True if buffer_length >= 24 and self._foreign_long_watch: self._is_current_word_bad = True if self._is_current_word_bad: self._bad_word_count += 1 self._bad_character_count += len(self._buffer) self._is_current_word_bad = False self._foreign_long_watch = False self._buffer = "" self._buffer_accent_count = 0 elif character not in {"<", ">", "-", "="} and character.isdigit() is False and is_symbol(character): self._is_current_word_bad = True self._buffer += character def reset(self) -> None: self._buffer = "" self._is_current_word_bad = False self._foreign_long_watch = False self._bad_word_count = 0 self._word_count = 0 self._character_count = 0 self._bad_character_count = 0 @property def ratio(self) -> float: if self._word_count <= 10: return 0. return self._bad_character_count / self._character_count class CjkInvalidStopPlugin(MessDetectorPlugin): """ GB(Chinese) based encoding often render the stop incorrectly when the content does not fit and can be easily detected. Searching for the overuse of '丅' and '丄'. """ def __init__(self): self._wrong_stop_count = 0 # type: int self._cjk_character_count = 0 # type: int def eligible(self, character: str) -> bool: return True def feed(self, character: str) -> None: if character in ["丅", "丄"]: self._wrong_stop_count += 1 return if is_cjk(character): self._cjk_character_count += 1 def reset(self) -> None: self._wrong_stop_count = 0 self._cjk_character_count = 0 @property def ratio(self) -> float: if self._cjk_character_count < 16: return 0. return self._wrong_stop_count / self._cjk_character_count class ArchaicUpperLowerPlugin(MessDetectorPlugin): def __init__(self): self._buf = False # type: bool self._character_count_since_last_sep = 0 # type: int self._successive_upper_lower_count = 0 # type: int self._successive_upper_lower_count_final = 0 # type: int self._character_count = 0 # type: int self._last_alpha_seen = None # type: Optional[str] self._current_ascii_only = True # type: bool def eligible(self, character: str) -> bool: return True def feed(self, character: str) -> None: is_concerned = character.isalpha() and is_case_variable(character) chunk_sep = is_concerned is False if chunk_sep and self._character_count_since_last_sep > 0: if self._character_count_since_last_sep <= 64 and character.isdigit() is False and self._current_ascii_only is False: self._successive_upper_lower_count_final += self._successive_upper_lower_count self._successive_upper_lower_count = 0 self._character_count_since_last_sep = 0 self._last_alpha_seen = None self._buf = False self._character_count += 1 self._current_ascii_only = True return if self._current_ascii_only is True and is_ascii(character) is False: self._current_ascii_only = False if self._last_alpha_seen is not None: if (character.isupper() and self._last_alpha_seen.islower()) or (character.islower() and self._last_alpha_seen.isupper()): if self._buf is True: self._successive_upper_lower_count += 2 self._buf = False else: self._buf = True else: self._buf = False self._character_count += 1 self._character_count_since_last_sep += 1 self._last_alpha_seen = character def reset(self) -> None: self._character_count = 0 self._character_count_since_last_sep = 0 self._successive_upper_lower_count = 0 self._successive_upper_lower_count_final = 0 self._last_alpha_seen = None self._buf = False self._current_ascii_only = True @property def ratio(self) -> float: if self._character_count == 0: return 0. return self._successive_upper_lower_count_final / self._character_count def is_suspiciously_successive_range(unicode_range_a: Optional[str], unicode_range_b: Optional[str]) -> bool: """ Determine if two Unicode range seen next to each other can be considered as suspicious. """ if unicode_range_a is None or unicode_range_b is None: return True if unicode_range_a == unicode_range_b: return False if "Latin" in unicode_range_a and "Latin" in unicode_range_b: return False if "Emoticons" in unicode_range_a or "Emoticons" in unicode_range_b: return False keywords_range_a, keywords_range_b = unicode_range_a.split(" "), unicode_range_b.split(" ") for el in keywords_range_a: if el in UNICODE_SECONDARY_RANGE_KEYWORD: continue if el in keywords_range_b: return False # Japanese Exception if unicode_range_a in ['Katakana', 'Hiragana'] and unicode_range_b in ['Katakana', 'Hiragana']: return False if unicode_range_a in ['Katakana', 'Hiragana'] or unicode_range_b in ['Katakana', 'Hiragana']: if "CJK" in unicode_range_a or "CJK" in unicode_range_b: return False if "Hangul" in unicode_range_a or "Hangul" in unicode_range_b: if "CJK" in unicode_range_a or "CJK" in unicode_range_b: return False if unicode_range_a == "Basic Latin" or unicode_range_b == "Basic Latin": return False # Chinese/Japanese use dedicated range for punctuation and/or separators. if ('CJK' in unicode_range_a or 'CJK' in unicode_range_b) or (unicode_range_a in ['Katakana', 'Hiragana'] and unicode_range_b in ['Katakana', 'Hiragana']): if 'Punctuation' in unicode_range_a or 'Punctuation' in unicode_range_b: return False if 'Forms' in unicode_range_a or 'Forms' in unicode_range_b: return False return True @lru_cache(maxsize=2048) def mess_ratio(decoded_sequence: str, maximum_threshold: float = 0.2, debug: bool = False) -> float: """ Compute a mess ratio given a decoded bytes sequence. The maximum threshold does stop the computation earlier. """ detectors = [] # type: List[MessDetectorPlugin] for md_class in MessDetectorPlugin.__subclasses__(): detectors.append( md_class() ) length = len(decoded_sequence) # type: int mean_mess_ratio = 0. # type: float if length < 512: intermediary_mean_mess_ratio_calc = 32 # type: int elif length <= 1024: intermediary_mean_mess_ratio_calc = 64 else: intermediary_mean_mess_ratio_calc = 128 for character, index in zip(decoded_sequence, range(0, length)): for detector in detectors: if detector.eligible(character): detector.feed(character) if (index > 0 and index % intermediary_mean_mess_ratio_calc == 0) or index == length-1: mean_mess_ratio = sum( [ dt.ratio for dt in detectors ] ) if mean_mess_ratio >= maximum_threshold: break if debug: for dt in detectors: # pragma: nocover print( dt.__class__, dt.ratio ) return round( mean_mess_ratio, 3 )