Source code for isbinary.check

from __future__ import annotations

import enum
import os
from typing import Final, Union

from ._chardet import chardet_detect


# Default number of bytes sampled from the start of a file (2 KiB).
_default_starting_chunk_len: Final = 2048

# Control characters that are accepted as part of ordinary text:
# line feed, carriage return, tab, form feed and backspace.
_control_chars: Final = b"\n\r\t\f\b"
# The control characters above plus the byte values 32..126.
_printable_ascii: Final = _control_chars + bytes(range(32, 127))
# The byte values 127..255 (DEL and every byte with the high bit set).
_printable_high_ascii: Final = bytes(range(127, 256))


def get_starting_chunk(
    filename: Union[str, os.PathLike], /, *, chunk_len: int = _default_starting_chunk_len
) -> bytes:
    """
    Read the leading bytes of a file.

    :param filename: File to open (in binary mode) and get the first little chunk of.
    :param chunk_len: Number of bytes to request from the file; defaults to the
        module-level ``_default_starting_chunk_len``.
    :return: Starting chunk of bytes.
    """
    with open(filename, "rb") as handle:
        head = handle.read(chunk_len)
    return head
class BinaryLikeliness(enum.Enum):
    """Graded verdict on how strongly a chunk of bytes looks like binary data."""

    HIGH = enum.auto()
    MID = enum.auto()
    LOW = enum.auto()

    @property
    def likely(self) -> bool:
        """``True`` for ``HIGH`` and ``MID``, ``False`` for ``LOW``."""
        return self in (BinaryLikeliness.MID, BinaryLikeliness.HIGH)
def is_likely_binary(bytes_to_check: bytes, /) -> BinaryLikeliness:
    """
    Grade a chunk of bytes by how binary-looking its byte-value distribution is.

    :param bytes_to_check: A chunk of bytes to check.
    :return: ``BinaryLikeliness.HIGH``, ``BinaryLikeliness.MID`` or
        ``BinaryLikeliness.LOW``. An empty chunk yields ``LOW`` rather than
        dividing by zero.
    """
    # An empty chunk has no bytes to measure; grade it as unlikely to be
    # binary instead of raising ZeroDivisionError in the ratios below.
    if not bytes_to_check:
        return BinaryLikeliness.LOW

    total = float(len(bytes_to_check))

    # Fraction of bytes that are NOT printable ASCII (i.e. what is left once
    # the accepted control characters and the values 32..126 are deleted).
    non_printable = bytes_to_check.translate(None, _printable_ascii)
    nontext_ratio1 = float(len(non_printable)) / total

    # Fraction of bytes that are NOT in the 127..255 range (i.e. what is left
    # once every high byte is deleted). A small value therefore means the
    # chunk consists almost entirely of high bytes.
    # From: https://en.wikipedia.org/wiki/UTF-8
    # If the bytes are random, the chances of a byte with the high bit set
    # starting a valid UTF-8 character is only 6.64%. The chances of finding 7
    # of these without finding an invalid sequence is actually lower than the
    # chance of the first three bytes randomly being the UTF-8 BOM.
    non_high = bytes_to_check.translate(None, _printable_high_ascii)
    nontext_ratio2 = float(len(non_high)) / total

    if nontext_ratio1 > 0.9 and nontext_ratio2 > 0.9:
        return BinaryLikeliness.HIGH

    if nontext_ratio1 > 0.3 and nontext_ratio2 < 0.05:
        return BinaryLikeliness.MID
    if nontext_ratio1 > 0.8 and nontext_ratio2 > 0.8:
        return BinaryLikeliness.MID
    return BinaryLikeliness.LOW
def is_decodable_as_unicode(bytes_to_check: bytes, /) -> bool:
    """
    Report whether the chunk decodes cleanly with a confidently detected,
    non-ASCII encoding.

    :param bytes_to_check: A chunk of bytes to check.
    :return: True if is unicode-decodable, False otherwise.
    """
    # Ask chardet for its best guess at the encoding.
    guess = chardet_detect(bytes_to_check)

    # Only a confident, non-ASCII guess is worth trying.
    if not (guess["confidence"] > 0.9 and guess["encoding"] != "ascii"):
        return False

    try:
        bytes_to_check.decode(encoding=guess["encoding"])
    except (LookupError, UnicodeDecodeError):
        # Unknown codec name, or the bytes are not valid in that encoding.
        return False
    return True
def has_null_bytes(bytes_to_check: bytes, /) -> bool:
    """
    Look for the marker bytes 0x00 and 0xFF in a chunk.

    :param bytes_to_check: A chunk of bytes to check.
    :return: True if the chunk contains a null byte (``0x00``) or a ``0xFF``
        byte, False otherwise.
    """
    return any(marker in bytes_to_check for marker in (b"\x00", b"\xff"))
def is_binary_string(bytes_to_check: bytes, /) -> bool:
    """
    Uses a simplified version of the Perl detection algorithm, based roughly on Eli Bendersky's translation to Python:
    https://eli.thegreenplace.net/2011/10/19/perls-guess-if-file-is-text-or-binary-implemented-in-python/

    This is biased slightly more in favour of deeming files as text files than the Perl algorithm, since all ASCII
    compatible character sets are accepted as text, not just utf-8.

    :param bytes_to_check: A chunk of bytes to check.
    :return: True if the chunk appears to be binary (not text), False otherwise.
    """
    # Empty files are considered text files.
    if not bytes_to_check:
        return False

    likeliness = is_likely_binary(bytes_to_check)
    # A HIGH grade settles the question without consulting the encoding detector.
    if likeliness is BinaryLikeliness.HIGH:
        return True

    # Anything that decodes with a confidently detected encoding is text.
    if is_decodable_as_unicode(bytes_to_check):
        return False

    # Not decodable: a MID grade means binary; for LOW, fall back to the
    # marker-byte check.
    return likeliness.likely or has_null_bytes(bytes_to_check)
def is_binary_file(
    filename: Union[str, os.PathLike], /, *, starting_chunk_len: int = _default_starting_chunk_len
) -> bool:
    """
    Decide whether a file is binary by inspecting its starting chunk.

    :param filename: File to check.
    :param starting_chunk_len: Number of bytes to read; defaults to the
        module-level ``_default_starting_chunk_len``.
    :return: True if it's a binary file, otherwise False.
    :raises FileNotFoundError: If the file is missing and the path is not a
        dangling symbolic link.
    """
    try:
        head = get_starting_chunk(filename, chunk_len=starting_chunk_len)
    except FileNotFoundError:
        # A symlink whose target does not exist is reported as binary;
        # any other missing path is propagated to the caller.
        dangling_link = os.path.islink(filename) and not os.path.exists(filename)
        if not dangling_link:
            raise
        return True
    else:
        # Check if the starting chunk is a binary string.
        return is_binary_string(head)
# Names exported by ``from <module> import *``; the underscore-prefixed
# module constants are intentionally not listed.
__all__ = (
    "get_starting_chunk",
    "BinaryLikeliness",
    "is_likely_binary",
    "is_decodable_as_unicode",
    "has_null_bytes",
    "is_binary_string",
    "is_binary_file",
)