import warnings
from typing import Dict, Optional, Union

from .api import from_bytes, from_fp, from_path, normalize
from .constant import CHARDET_CORRESPONDENCE
from .models import CharsetMatch, CharsetMatches


def detect(byte_str: bytes) -> Dict[str, Optional[Union[str, float]]]:
    """
    chardet legacy method
    Detect the encoding of the given byte string. It should be mostly backward-compatible.
    Encoding names will match Chardet's own spelling whenever possible (except for encodings Chardet does not support).
    This function is deprecated; it exists only to ease migration of existing projects. Consult the documentation for
    further information. It is not planned for removal.

    :param byte_str: The byte sequence to examine.
    """
    if not isinstance(byte_str, (bytearray, bytes)):
        raise TypeError(  # pragma: nocover
            "Expected object of type bytes or bytearray, got: "
            "{0}".format(type(byte_str))
        )

    if isinstance(byte_str, bytearray):
        byte_str = bytes(byte_str)

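    # best() yields the most plausible CharsetMatch found by the analysis, or None when the
    # payload could not be matched against any known encoding (handled below).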
    r = from_bytes(byte_str).best()

    encoding = r.encoding if r is not None else None
    language = r.language if r is not None and r.language != "Unknown" else ""
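    # Confidence is reported as the complement of the match's chaos ("mess") ratio, mirroring
    # chardet's 0.0-1.0 confidence scale.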
    confidence = 1.0 - r.chaos if r is not None else None

    # Note: CharsetNormalizer does not return 'UTF-8-SIG', as the SIG gets stripped during the
    # detection/normalization process, but chardet does return 'utf-8-sig' and it is a valid codec name.
    if r is not None and encoding == "utf_8" and r.bom:
        encoding += "_sig"

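    # Translate charset-normalizer's encoding label into chardet's spelling when a
    # correspondence is known; otherwise return the label as-is.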
    return {
        "encoding": encoding
        if encoding not in CHARDET_CORRESPONDENCE
        else CHARDET_CORRESPONDENCE[encoding],
        "language": language,
        "confidence": confidence,
    }


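# Illustrative usage of the legacy shim above (an editor's sketch, not part of the original
# module; it assumes detect() is re-exported at the package root, as charset-normalizer does):
# the returned mapping mirrors chardet's result shape, so existing chardet call sites normally
# only need to swap their import.
#
#     from charset_normalizer import detect
#
#     result = detect("Résultat : succès".encode("utf_8"))
#     result["encoding"]    # chardet-style name, e.g. "utf-8"
#     result["language"]    # e.g. "French", or "" when unknown
#     result["confidence"]  # 1.0 - chaos, or None when nothing matched

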
class CharsetNormalizerMatch(CharsetMatch):
    pass


class CharsetNormalizerMatches(CharsetMatches):
    @staticmethod
    def from_fp(*args, **kwargs):  # type: ignore
        warnings.warn(  # pragma: nocover
            "staticmethod from_fp, from_bytes, from_path and normalize are deprecated "
            "and scheduled to be removed in 3.0",
            DeprecationWarning,
        )
        return from_fp(*args, **kwargs)  # pragma: nocover

    @staticmethod
    def from_bytes(*args, **kwargs):  # type: ignore
        warnings.warn(  # pragma: nocover
            "staticmethod from_fp, from_bytes, from_path and normalize are deprecated "
            "and scheduled to be removed in 3.0",
            DeprecationWarning,
        )
        return from_bytes(*args, **kwargs)  # pragma: nocover

    @staticmethod
    def from_path(*args, **kwargs):  # type: ignore
        warnings.warn(  # pragma: nocover
            "staticmethod from_fp, from_bytes, from_path and normalize are deprecated "
            "and scheduled to be removed in 3.0",
            DeprecationWarning,
        )
        return from_path(*args, **kwargs)  # pragma: nocover

    @staticmethod
    def normalize(*args, **kwargs):  # type: ignore
        warnings.warn(  # pragma: nocover
            "staticmethod from_fp, from_bytes, from_path and normalize are deprecated "
            "and scheduled to be removed in 3.0",
            DeprecationWarning,
        )
        return normalize(*args, **kwargs)  # pragma: nocover


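# Illustrative sketch (not part of the original module; the import path below assumes this
# file lives at charset_normalizer/legacy.py): the static methods above simply forward to the
# module-level API after emitting a DeprecationWarning, so 1.x-style call sites keep working.
#
#     import warnings
#     from charset_normalizer.legacy import CharsetNormalizerMatches
#
#     with warnings.catch_warnings(record=True) as caught:
#         warnings.simplefilter("always")
#         matches = CharsetNormalizerMatches.from_bytes(b"hello world")
#     assert any(issubclass(w.category, DeprecationWarning) for w in caught)

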
class CharsetDetector(CharsetNormalizerMatches):
    pass


class CharsetDoctor(CharsetNormalizerMatches):
    pass