import re import sys from ast import literal_eval from functools import total_ordering from typing import NamedTuple, Sequence, Union # The following is a list in Python that are line breaks in str.splitlines, but # not in Python. In Python only \r (Carriage Return, 0xD) and \n (Line Feed, # 0xA) are allowed to split lines. _NON_LINE_BREAKS = ( '\v', # Vertical Tabulation 0xB '\f', # Form Feed 0xC '\x1C', # File Separator '\x1D', # Group Separator '\x1E', # Record Separator '\x85', # Next Line (NEL - Equivalent to CR+LF. # Used to mark end-of-line on some IBM mainframes.) '\u2028', # Line Separator '\u2029', # Paragraph Separator ) class Version(NamedTuple): major: int minor: int micro: int def split_lines(string: str, keepends: bool = False) -> Sequence[str]: r""" Intended for Python code. In contrast to Python's :py:meth:`str.splitlines`, looks at form feeds and other special characters as normal text. Just splits ``\n`` and ``\r\n``. Also different: Returns ``[""]`` for an empty string input. In Python 2.7 form feeds are used as normal characters when using str.splitlines. However in Python 3 somewhere there was a decision to split also on form feeds. """ if keepends: lst = string.splitlines(True) # We have to merge lines that were broken by form feed characters. merge = [] for i, line in enumerate(lst): try: last_chr = line[-1] except IndexError: pass else: if last_chr in _NON_LINE_BREAKS: merge.append(i) for index in reversed(merge): try: lst[index] = lst[index] + lst[index + 1] del lst[index + 1] except IndexError: # index + 1 can be empty and therefore there's no need to # merge. pass # The stdlib's implementation of the end is inconsistent when calling # it with/without keepends. One time there's an empty string in the # end, one time there's none. if string.endswith('\n') or string.endswith('\r') or string == '': lst.append('') return lst else: return re.split(r'\n|\r\n|\r', string) def python_bytes_to_unicode( source: Union[str, bytes], encoding: str = 'utf-8', errors: str = 'strict' ) -> str: """ Checks for unicode BOMs and PEP 263 encoding declarations. Then returns a unicode object like in :py:meth:`bytes.decode`. :param encoding: See :py:meth:`bytes.decode` documentation. :param errors: See :py:meth:`bytes.decode` documentation. ``errors`` can be ``'strict'``, ``'replace'`` or ``'ignore'``. """ def detect_encoding(): """ For the implementation of encoding definitions in Python, look at: - http://www.python.org/dev/peps/pep-0263/ - http://docs.python.org/2/reference/lexical_analysis.html#encoding-declarations """ byte_mark = literal_eval(r"b'\xef\xbb\xbf'") if source.startswith(byte_mark): # UTF-8 byte-order mark return 'utf-8' first_two_lines = re.match(br'(?:[^\r\n]*(?:\r\n|\r|\n)){0,2}', source).group(0) possible_encoding = re.search(br"coding[=:]\s*([-\w.]+)", first_two_lines) if possible_encoding: e = possible_encoding.group(1) if not isinstance(e, str): e = str(e, 'ascii', 'replace') return e else: # the default if nothing else has been set -> PEP 263 return encoding if isinstance(source, str): # only cast str/bytes return source encoding = detect_encoding() try: # Cast to unicode return str(source, encoding, errors) except LookupError: if errors == 'replace': # This is a weird case that can happen if the given encoding is not # a valid encoding. This usually shouldn't happen with provided # encodings, but can happen if somebody uses encoding declarations # like `# coding: foo-8`. return str(source, 'utf-8', errors) raise def version_info() -> Version: """ Returns a namedtuple of parso's version, similar to Python's ``sys.version_info``. """ from parso import __version__ tupl = re.findall(r'[a-z]+|\d+', __version__) return Version(*[x if i == 3 else int(x) for i, x in enumerate(tupl)]) class _PythonVersionInfo(NamedTuple): major: int minor: int @total_ordering class PythonVersionInfo(_PythonVersionInfo): def __gt__(self, other): if isinstance(other, tuple): if len(other) != 2: raise ValueError("Can only compare to tuples of length 2.") return (self.major, self.minor) > other super().__gt__(other) return (self.major, self.minor) def __eq__(self, other): if isinstance(other, tuple): if len(other) != 2: raise ValueError("Can only compare to tuples of length 2.") return (self.major, self.minor) == other super().__eq__(other) def __ne__(self, other): return not self.__eq__(other) def _parse_version(version) -> PythonVersionInfo: match = re.match(r'(\d+)(?:\.(\d{1,2})(?:\.\d+)?)?((a|b|rc)\d)?$', version) if match is None: raise ValueError('The given version is not in the right format. ' 'Use something like "3.8" or "3".') major = int(match.group(1)) minor = match.group(2) if minor is None: # Use the latest Python in case it's not exactly defined, because the # grammars are typically backwards compatible? if major == 2: minor = "7" elif major == 3: minor = "6" else: raise NotImplementedError("Sorry, no support yet for those fancy new/old versions.") minor = int(minor) return PythonVersionInfo(major, minor) def parse_version_string(version: str = None) -> PythonVersionInfo: """ Checks for a valid version number (e.g. `3.8` or `3.10.1` or `3`) and returns a corresponding version info that is always two characters long in decimal. """ if version is None: version = '%s.%s' % sys.version_info[:2] if not isinstance(version, str): raise TypeError('version must be a string like "3.8"') return _parse_version(version)