"""
|
|
Tools to open .py files as Unicode, using the encoding specified within the file,
|
|
as per PEP 263.
|
|
|
|
Much of the code is taken from the tokenize module in Python 3.2.
|
|
"""
|
|
|
|
import io
|
|
from io import TextIOWrapper, BytesIO
|
|
from pathlib import Path
|
|
import re
|
|
from tokenize import open, detect_encoding
|
|
|
|
cookie_re = re.compile(r"coding[:=]\s*([-\w.]+)", re.UNICODE)
|
|
cookie_comment_re = re.compile(r"^\s*#.*coding[:=]\s*([-\w.]+)", re.UNICODE)
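
# Illustrative example of the PEP 263 declarations these patterns match
# (the cookie line below is an assumed example):
#
#     >>> cookie_comment_re.match("# -*- coding: latin-1 -*-").group(1)
#     'latin-1'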


def source_to_unicode(txt, errors='replace', skip_encoding_cookie=True):
    """Converts a bytes string with python source code to unicode.

    Unicode strings are passed through unchanged. Byte strings are checked
    for the python source file encoding cookie to determine encoding.
    txt can be a bytes object, a file-like object opened in binary mode, or
    a str containing the source code.
    """
    if isinstance(txt, str):
        return txt
    if isinstance(txt, bytes):
        buffer = BytesIO(txt)
    else:
        buffer = txt
    try:
        encoding, _ = detect_encoding(buffer.readline)
    except SyntaxError:
        encoding = "ascii"
    buffer.seek(0)
    with TextIOWrapper(buffer, encoding, errors=errors, line_buffering=True) as text:
        text.mode = 'r'
        if skip_encoding_cookie:
            return u"".join(strip_encoding_cookie(text))
        else:
            return text.read()
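
# Usage sketch (the byte string below is an assumed example):
#
#     >>> src = b"# -*- coding: utf-8 -*-\nx = 1\n"
#     >>> source_to_unicode(src)
#     'x = 1\n'
#     >>> source_to_unicode(src, skip_encoding_cookie=False)
#     '# -*- coding: utf-8 -*-\nx = 1\n'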


def strip_encoding_cookie(filelike):
    """Generator to pull lines from a text-mode file, skipping the encoding
    cookie if it is found in the first two lines.
    """
    it = iter(filelike)
    try:
        first = next(it)
        if not cookie_comment_re.match(first):
            yield first
        second = next(it)
        if not cookie_comment_re.match(second):
            yield second
    except StopIteration:
        return

    for line in it:
        yield line
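
# A minimal sketch: the generator works on any iterable of text lines, and only
# the first two lines are checked for a cookie (the list is an assumed example):
#
#     >>> lines = ["#!/usr/bin/env python\n", "# coding: utf-8\n", "pass\n"]
#     >>> list(strip_encoding_cookie(lines))
#     ['#!/usr/bin/env python\n', 'pass\n']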


def read_py_file(filename, skip_encoding_cookie=True):
    """Read a Python file, using the encoding declared inside the file.

    Parameters
    ----------
    filename : str
        The path to the file to read.
    skip_encoding_cookie : bool
        If True (the default), and the encoding declaration is found in the first
        two lines, that line will be excluded from the output.

    Returns
    -------
    A unicode string containing the contents of the file.
    """
    filepath = Path(filename)
    with open(filepath) as f:  # 'open' is tokenize.open (imported above); it respects the encoding cookie.
        if skip_encoding_cookie:
            return "".join(strip_encoding_cookie(f))
        else:
            return f.read()
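
# Note: unlike source_to_unicode() and read_py_url(), read_py_file() takes no
# ``errors`` argument, so undecodable bytes raise rather than being replaced.
# A minimal usage sketch (the module reading itself; purely illustrative):
#
#     >>> text = read_py_file(__file__)
#     >>> "PEP 263" in text
#     True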


def read_py_url(url, errors='replace', skip_encoding_cookie=True):
    """Read a Python file from a URL, using the encoding declared inside the file.

    Parameters
    ----------
    url : str
        The URL from which to fetch the file.
    errors : str
        How to handle decoding errors in the file. Options are the same as for
        bytes.decode(), but here 'replace' is the default.
    skip_encoding_cookie : bool
        If True (the default), and the encoding declaration is found in the first
        two lines, that line will be excluded from the output.

    Returns
    -------
    A unicode string containing the contents of the file.
    """
    # Deferred import for faster start
    from urllib.request import urlopen
    response = urlopen(url)
    buffer = io.BytesIO(response.read())
    return source_to_unicode(buffer, errors, skip_encoding_cookie)
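

if __name__ == "__main__":
    # Minimal illustrative demo (assumed example input, not a test suite):
    # decode a latin-1 snippet, then read this module back using its own encoding.
    demo = b"# -*- coding: latin-1 -*-\nname = '\xe9'\n"
    print(repr(source_to_unicode(demo)))      # "name = 'é'\n"
    print(repr(read_py_file(__file__)[:40]))  # first 40 characters of this file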