from __future__ import annotations
import enum
import os
from typing import Final, Union
from ._chardet import chardet_detect
_default_starting_chunk_len: Final = 2028
_control_chars: Final = b"\n\r\t\f\b"
_printable_ascii: Final = _control_chars + bytes(range(32, 127))
_printable_high_ascii: Final = bytes(range(127, 256))
[docs]def get_starting_chunk(
filename: Union[str, os.PathLike], /, *, chunk_len: int = _default_starting_chunk_len
) -> bytes:
"""
:param filename: File to open and get the first little chunk of.
:param chunk_len: Number of bytes to read, default 2048.
:return: Starting chunk of bytes.
"""
with open(filename, "rb") as f:
return f.read(chunk_len)
[docs]class BinaryLikeliness(enum.Enum):
HIGH = enum.auto()
MID = enum.auto()
LOW = enum.auto()
@property
def likely(self) -> bool:
return self == BinaryLikeliness.MID or self == BinaryLikeliness.HIGH
[docs]def is_likely_binary(bytes_to_check: bytes, /) -> BinaryLikeliness:
"""
:param bytes_to_check: A chunk of bytes to check.
:return: True if is likely binary, False otherwise.
"""
# Check for a high percentage of ASCII control characters
# Binary if control chars are > 30% of the string
low_chars = bytes_to_check.translate(None, _printable_ascii)
nontext_ratio1 = float(len(low_chars)) / float(len(bytes_to_check))
# and check for a low percentage of high ASCII characters:
# Binary if high ASCII chars are < 5% of the string
# From: https://en.wikipedia.org/wiki/UTF-8
# If the bytes are random, the chances of a byte with the high bit set
# starting a valid UTF-8 character is only 6.64%. The chances of finding 7
# of these without finding an invalid sequence is actually lower than the
# chance of the first three bytes randomly being the UTF-8 BOM.
high_chars = bytes_to_check.translate(None, _printable_high_ascii)
nontext_ratio2 = float(len(high_chars)) / float(len(bytes_to_check))
if nontext_ratio1 > 0.9 and nontext_ratio2 > 0.9:
return BinaryLikeliness.HIGH
if nontext_ratio1 > 0.3 and nontext_ratio2 < 0.05:
return BinaryLikeliness.MID
elif nontext_ratio1 > 0.8 and nontext_ratio2 > 0.8:
return BinaryLikeliness.MID
else:
return BinaryLikeliness.LOW
[docs]def is_decodable_as_unicode(bytes_to_check: bytes, /) -> bool:
"""
:param bytes_to_check: A chunk of bytes to check.
:return: True if is unicode-decodable, False otherwise.
"""
# Check for binary for possible encoding detection with chardet
detected_encoding = chardet_detect(bytes_to_check)
# Decide if binary or text
decodable_as_unicode = False
if detected_encoding["confidence"] > 0.9 and detected_encoding["encoding"] != "ascii":
try:
bytes_to_check.decode(encoding=detected_encoding["encoding"])
decodable_as_unicode = True
except (LookupError, UnicodeDecodeError):
pass
return decodable_as_unicode
[docs]def has_null_bytes(bytes_to_check: bytes, /) -> bool:
"""
:param bytes_to_check: A chunk of bytes to check.
:return: True if the chunk contains null bytes, False otherwise.
"""
return b"\x00" in bytes_to_check or b"\xff" in bytes_to_check
[docs]def is_binary_string(bytes_to_check: bytes, /) -> bool:
"""
Uses a simplified version of the Perl detection algorithm,
based roughly on Eli Bendersky's translation to Python:
https://eli.thegreenplace.net/2011/10/19/perls-guess-if-file-is-text-or-binary-implemented-in-python/
This is biased slightly more in favour of deeming files as text
files than the Perl algorithm, since all ASCII compatible character
sets are accepted as text, not just utf-8.
:param bytes_to_check: A chunk of bytes to check.
:return: True if the chunk appears to be binary (not text), False otherwise.
"""
# Empty files are considered text files.
if not bytes_to_check:
return False
likely_binary = is_likely_binary(bytes_to_check)
if likely_binary == BinaryLikeliness.HIGH:
return True
decodable_as_unicode = is_decodable_as_unicode(bytes_to_check)
if likely_binary.likely:
return not decodable_as_unicode
if decodable_as_unicode:
return False
return has_null_bytes(bytes_to_check)
[docs]def is_binary_file(
filename: Union[str, os.PathLike], /, *, starting_chunk_len: int = _default_starting_chunk_len
) -> bool:
"""
:param filename: File to check.
:param starting_chunk_len: Number of bytes to read, default 2048.
:return: True if it's a binary file, otherwise False.
"""
# Check if the starting chunk is a binary string
try:
chunk = get_starting_chunk(filename, chunk_len=starting_chunk_len)
except FileNotFoundError:
if os.path.islink(filename) and not os.path.exists(filename):
return True
raise
return is_binary_string(chunk)
__all__ = (
"get_starting_chunk",
"BinaryLikeliness",
"is_likely_binary",
"is_decodable_as_unicode",
"has_null_bytes",
"is_binary_string",
"is_binary_file",
)