main title

', '\t': '', '\n': '', '\r': '', '\f': '', u'\u00A0': '', u'\u1680': '', u'\u180E': '', u'\u2000': '', u'\u2001': '', u'\u2002': '', u'\u2003': '', u'\u2004': '', u'\u2005': '', u'\u2006': '', u'\u2007': '', u'\u2008': '', u'\u2009': '', u'\u200A': '', u'\u200B': '', u'\u202F': '', u'\u205F': '', u'\u3000': '', } def __init__(self, ws=" \t\r\n", min=1, max=0, exact=0): super(White, self).__init__() self.matchWhite = ws self.setWhitespaceChars("".join(c for c in self.whiteChars if c not in self.matchWhite)) # ~ self.leaveWhitespace() self.name = ("".join(White.whiteStrs[c] for c in self.matchWhite)) self.mayReturnEmpty = True self.errmsg = "Expected " + self.name self.minLen = min if max > 0: self.maxLen = max else: self.maxLen = _MAX_INT if exact > 0: self.maxLen = exact self.minLen = exact def parseImpl(self, instring, loc, doActions=True): if instring[loc] not in self.matchWhite: raise ParseException(instring, loc, self.errmsg, self) start = loc loc += 1 maxloc = start + self.maxLen maxloc = min(maxloc, len(instring)) while loc < maxloc and instring[loc] in self.matchWhite: loc += 1 if loc - start < self.minLen: raise ParseException(instring, loc, self.errmsg, self) return loc, instring[start:loc] class _PositionToken(Token): def __init__(self): super(_PositionToken, self).__init__() self.name = self.__class__.__name__ self.mayReturnEmpty = True self.mayIndexError = False class GoToColumn(_PositionToken): """Token to advance to a specific column of input text; useful for tabular report scraping. """ def __init__(self, colno): super(GoToColumn, self).__init__() self.col = colno def preParse(self, instring, loc): if col(loc, instring) != self.col: instrlen = len(instring) if self.ignoreExprs: loc = self._skipIgnorables(instring, loc) while loc < instrlen and instring[loc].isspace() and col(loc, instring) != self.col: loc += 1 return loc def parseImpl(self, instring, loc, doActions=True): thiscol = col(loc, instring) if thiscol > self.col: raise ParseException(instring, loc, "Text not in expected column", self) newloc = loc + self.col - thiscol ret = instring[loc: newloc] return newloc, ret class LineStart(_PositionToken): r"""Matches if current position is at the beginning of a line within the parse string Example:: test = '''\ AAA this line AAA and this line AAA but not this one B AAA and definitely not this one ''' for t in (LineStart() + 'AAA' + restOfLine).searchString(test): print(t) prints:: ['AAA', ' this line'] ['AAA', ' and this line'] """ def __init__(self): super(LineStart, self).__init__() self.errmsg = "Expected start of line" def parseImpl(self, instring, loc, doActions=True): if col(loc, instring) == 1: return loc, [] raise ParseException(instring, loc, self.errmsg, self) class LineEnd(_PositionToken): """Matches if current position is at the end of a line within the parse string """ def __init__(self): super(LineEnd, self).__init__() self.setWhitespaceChars(ParserElement.DEFAULT_WHITE_CHARS.replace("\n", "")) self.errmsg = "Expected end of line" def parseImpl(self, instring, loc, doActions=True): if loc < len(instring): if instring[loc] == "\n": return loc + 1, "\n" else: raise ParseException(instring, loc, self.errmsg, self) elif loc == len(instring): return loc + 1, [] else: raise ParseException(instring, loc, self.errmsg, self) class StringStart(_PositionToken): """Matches if current position is at the beginning of the parse string """ def __init__(self): super(StringStart, self).__init__() self.errmsg = "Expected start of text" def parseImpl(self, instring, loc, doActions=True): if loc != 0: # see if entire string up to here is just whitespace and ignoreables if loc != self.preParse(instring, 0): raise ParseException(instring, loc, self.errmsg, self) return loc, [] class StringEnd(_PositionToken): """Matches if current position is at the end of the parse string """ def __init__(self): super(StringEnd, self).__init__() self.errmsg = "Expected end of text" def parseImpl(self, instring, loc, doActions=True): if loc < len(instring): raise ParseException(instring, loc, self.errmsg, self) elif loc == len(instring): return loc + 1, [] elif loc > len(instring): return loc, [] else: raise ParseException(instring, loc, self.errmsg, self) class WordStart(_PositionToken): """Matches if the current position is at the beginning of a Word, and is not preceded by any character in a given set of ``wordChars`` (default= ``printables``). To emulate the ``\b`` behavior of regular expressions, use ``WordStart(alphanums)``. ``WordStart`` will also match at the beginning of the string being parsed, or at the beginning of a line. """ def __init__(self, wordChars=printables): super(WordStart, self).__init__() self.wordChars = set(wordChars) self.errmsg = "Not at the start of a word" def parseImpl(self, instring, loc, doActions=True): if loc != 0: if (instring[loc - 1] in self.wordChars or instring[loc] not in self.wordChars): raise ParseException(instring, loc, self.errmsg, self) return loc, [] class WordEnd(_PositionToken): """Matches if the current position is at the end of a Word, and is not followed by any character in a given set of ``wordChars`` (default= ``printables``). To emulate the ``\b`` behavior of regular expressions, use ``WordEnd(alphanums)``. ``WordEnd`` will also match at the end of the string being parsed, or at the end of a line. """ def __init__(self, wordChars=printables): super(WordEnd, self).__init__() self.wordChars = set(wordChars) self.skipWhitespace = False self.errmsg = "Not at the end of a word" def parseImpl(self, instring, loc, doActions=True): instrlen = len(instring) if instrlen > 0 and loc < instrlen: if (instring[loc] in self.wordChars or instring[loc - 1] not in self.wordChars): raise ParseException(instring, loc, self.errmsg, self) return loc, [] class ParseExpression(ParserElement): """Abstract subclass of ParserElement, for combining and post-processing parsed tokens. """ def __init__(self, exprs, savelist=False): super(ParseExpression, self).__init__(savelist) if isinstance(exprs, _generatorType): exprs = list(exprs) if isinstance(exprs, basestring): self.exprs = [self._literalStringClass(exprs)] elif isinstance(exprs, ParserElement): self.exprs = [exprs] elif isinstance(exprs, Iterable): exprs = list(exprs) # if sequence of strings provided, wrap with Literal if any(isinstance(expr, basestring) for expr in exprs): exprs = (self._literalStringClass(e) if isinstance(e, basestring) else e for e in exprs) self.exprs = list(exprs) else: try: self.exprs = list(exprs) except TypeError: self.exprs = [exprs] self.callPreparse = False def append(self, other): self.exprs.append(other) self.strRepr = None return self def leaveWhitespace(self): """Extends ``leaveWhitespace`` defined in base class, and also invokes ``leaveWhitespace`` on all contained expressions.""" self.skipWhitespace = False self.exprs = [e.copy() for e in self.exprs] for e in self.exprs: e.leaveWhitespace() return self def ignore(self, other): if isinstance(other, Suppress): if other not in self.ignoreExprs: super(ParseExpression, self).ignore(other) for e in self.exprs: e.ignore(self.ignoreExprs[-1]) else: super(ParseExpression, self).ignore(other) for e in self.exprs: e.ignore(self.ignoreExprs[-1]) return self def __str__(self): try: return super(ParseExpression, self).__str__() except Exception: pass if self.strRepr is None: self.strRepr = "%s:(%s)" % (self.__class__.__name__, _ustr(self.exprs)) return self.strRepr def streamline(self): super(ParseExpression, self).streamline() for e in self.exprs: e.streamline() # collapse nested And's of the form And(And(And(a, b), c), d) to And(a, b, c, d) # but only if there are no parse actions or resultsNames on the nested And's # (likewise for Or's and MatchFirst's) if len(self.exprs) == 2: other = self.exprs[0] if (isinstance(other, self.__class__) and not other.parseAction and other.resultsName is None and not other.debug): self.exprs = other.exprs[:] + [self.exprs[1]] self.strRepr = None self.mayReturnEmpty |= other.mayReturnEmpty self.mayIndexError |= other.mayIndexError other = self.exprs[-1] if (isinstance(other, self.__class__) and not other.parseAction and other.resultsName is None and not other.debug): self.exprs = self.exprs[:-1] + other.exprs[:] self.strRepr = None self.mayReturnEmpty |= other.mayReturnEmpty self.mayIndexError |= other.mayIndexError self.errmsg = "Expected " + _ustr(self) return self def validate(self, validateTrace=None): tmp = (validateTrace if validateTrace is not None else [])[:] + [self] for e in self.exprs: e.validate(tmp) self.checkRecursion([]) def copy(self): ret = super(ParseExpression, self).copy() ret.exprs = [e.copy() for e in self.exprs] return ret def _setResultsName(self, name, listAllMatches=False): if __diag__.warn_ungrouped_named_tokens_in_collection: for e in self.exprs: if isinstance(e, ParserElement) and e.resultsName: warnings.warn("{0}: setting results name {1!r} on {2} expression " "collides with {3!r} on contained expression".format("warn_ungrouped_named_tokens_in_collection", name, type(self).__name__, e.resultsName), stacklevel=3) return super(ParseExpression, self)._setResultsName(name, listAllMatches) class And(ParseExpression): """ Requires all given :class:`ParseExpression` s to be found in the given order. Expressions may be separated by whitespace. May be constructed using the ``'+'`` operator. May also be constructed using the ``'-'`` operator, which will suppress backtracking. Example:: integer = Word(nums) name_expr = OneOrMore(Word(alphas)) expr = And([integer("id"), name_expr("name"), integer("age")]) # more easily written as: expr = integer("id") + name_expr("name") + integer("age") """ class _ErrorStop(Empty): def __init__(self, *args, **kwargs): super(And._ErrorStop, self).__init__(*args, **kwargs) self.name = '-' self.leaveWhitespace() def __init__(self, exprs, savelist=True): exprs = list(exprs) if exprs and Ellipsis in exprs: tmp = [] for i, expr in enumerate(exprs): if expr is Ellipsis: if i < len(exprs) - 1: skipto_arg = (Empty() + exprs[i + 1]).exprs[-1] tmp.append(SkipTo(skipto_arg)("_skipped*")) else: raise Exception("cannot construct And with sequence ending in ...") else: tmp.append(expr) exprs[:] = tmp super(And, self).__init__(exprs, savelist) self.mayReturnEmpty = all(e.mayReturnEmpty for e in self.exprs) self.setWhitespaceChars(self.exprs[0].whiteChars) self.skipWhitespace = self.exprs[0].skipWhitespace self.callPreparse = True def streamline(self): # collapse any _PendingSkip's if self.exprs: if any(isinstance(e, ParseExpression) and e.exprs and isinstance(e.exprs[-1], _PendingSkip) for e in self.exprs[:-1]): for i, e in enumerate(self.exprs[:-1]): if e is None: continue if (isinstance(e, ParseExpression) and e.exprs and isinstance(e.exprs[-1], _PendingSkip)): e.exprs[-1] = e.exprs[-1] + self.exprs[i + 1] self.exprs[i + 1] = None self.exprs = [e for e in self.exprs if e is not None] super(And, self).streamline() self.mayReturnEmpty = all(e.mayReturnEmpty for e in self.exprs) return self def parseImpl(self, instring, loc, doActions=True): # pass False as last arg to _parse for first element, since we already # pre-parsed the string as part of our And pre-parsing loc, resultlist = self.exprs[0]._parse(instring, loc, doActions, callPreParse=False) errorStop = False for e in self.exprs[1:]: if isinstance(e, And._ErrorStop): errorStop = True continue if errorStop: try: loc, exprtokens = e._parse(instring, loc, doActions) except ParseSyntaxException: raise except ParseBaseException as pe: pe.__traceback__ = None raise ParseSyntaxException._from_exception(pe) except IndexError: raise ParseSyntaxException(instring, len(instring), self.errmsg, self) else: loc, exprtokens = e._parse(instring, loc, doActions) if exprtokens or exprtokens.haskeys(): resultlist += exprtokens return loc, resultlist def __iadd__(self, other): if isinstance(other, basestring): other = self._literalStringClass(other) return self.append(other) # And([self, other]) def checkRecursion(self, parseElementList): subRecCheckList = parseElementList[:] + [self] for e in self.exprs: e.checkRecursion(subRecCheckList) if not e.mayReturnEmpty: break def __str__(self): if hasattr(self, "name"): return self.name if self.strRepr is None: self.strRepr = "{" + " ".join(_ustr(e) for e in self.exprs) + "}" return self.strRepr class Or(ParseExpression): """Requires that at least one :class:`ParseExpression` is found. If two expressions match, the expression that matches the longest string will be used. May be constructed using the ``'^'`` operator. Example:: # construct Or using '^' operator number = Word(nums) ^ Combine(Word(nums) + '.' + Word(nums)) print(number.searchString("123 3.1416 789")) prints:: [['123'], ['3.1416'], ['789']] """ def __init__(self, exprs, savelist=False): super(Or, self).__init__(exprs, savelist) if self.exprs: self.mayReturnEmpty = any(e.mayReturnEmpty for e in self.exprs) else: self.mayReturnEmpty = True def streamline(self): super(Or, self).streamline() if __compat__.collect_all_And_tokens: self.saveAsList = any(e.saveAsList for e in self.exprs) return self def parseImpl(self, instring, loc, doActions=True): maxExcLoc = -1 maxException = None matches = [] for e in self.exprs: try: loc2 = e.tryParse(instring, loc) except ParseException as err: err.__traceback__ = None if err.loc > maxExcLoc: maxException = err maxExcLoc = err.loc except IndexError: if len(instring) > maxExcLoc: maxException = ParseException(instring, len(instring), e.errmsg, self) maxExcLoc = len(instring) else: # save match among all matches, to retry longest to shortest matches.append((loc2, e)) if matches: # re-evaluate all matches in descending order of length of match, in case attached actions # might change whether or how much they match of the input. matches.sort(key=itemgetter(0), reverse=True) if not doActions: # no further conditions or parse actions to change the selection of # alternative, so the first match will be the best match best_expr = matches[0][1] return best_expr._parse(instring, loc, doActions) longest = -1, None for loc1, expr1 in matches: if loc1 <= longest[0]: # already have a longer match than this one will deliver, we are done return longest try: loc2, toks = expr1._parse(instring, loc, doActions) except ParseException as err: err.__traceback__ = None if err.loc > maxExcLoc: maxException = err maxExcLoc = err.loc else: if loc2 >= loc1: return loc2, toks # didn't match as much as before elif loc2 > longest[0]: longest = loc2, toks if longest != (-1, None): return longest if maxException is not None: maxException.msg = self.errmsg raise maxException else: raise ParseException(instring, loc, "no defined alternatives to match", self) def __ixor__(self, other): if isinstance(other, basestring): other = self._literalStringClass(other) return self.append(other) # Or([self, other]) def __str__(self): if hasattr(self, "name"): return self.name if self.strRepr is None: self.strRepr = "{" + " ^ ".join(_ustr(e) for e in self.exprs) + "}" return self.strRepr def checkRecursion(self, parseElementList): subRecCheckList = parseElementList[:] + [self] for e in self.exprs: e.checkRecursion(subRecCheckList) def _setResultsName(self, name, listAllMatches=False): if (not __compat__.collect_all_And_tokens and __diag__.warn_multiple_tokens_in_named_alternation): if any(isinstance(e, And) for e in self.exprs): warnings.warn("{0}: setting results name {1!r} on {2} expression " "may only return a single token for an And alternative, " "in future will return the full list of tokens".format( "warn_multiple_tokens_in_named_alternation", name, type(self).__name__), stacklevel=3) return super(Or, self)._setResultsName(name, listAllMatches) class MatchFirst(ParseExpression): """Requires that at least one :class:`ParseExpression` is found. If two expressions match, the first one listed is the one that will match. May be constructed using the ``'|'`` operator. Example:: # construct MatchFirst using '|' operator # watch the order of expressions to match number = Word(nums) | Combine(Word(nums) + '.' + Word(nums)) print(number.searchString("123 3.1416 789")) # Fail! -> [['123'], ['3'], ['1416'], ['789']] # put more selective expression first number = Combine(Word(nums) + '.' + Word(nums)) | Word(nums) print(number.searchString("123 3.1416 789")) # Better -> [['123'], ['3.1416'], ['789']] """ def __init__(self, exprs, savelist=False): super(MatchFirst, self).__init__(exprs, savelist) if self.exprs: self.mayReturnEmpty = any(e.mayReturnEmpty for e in self.exprs) else: self.mayReturnEmpty = True def streamline(self): super(MatchFirst, self).streamline() if __compat__.collect_all_And_tokens: self.saveAsList = any(e.saveAsList for e in self.exprs) return self def parseImpl(self, instring, loc, doActions=True): maxExcLoc = -1 maxException = None for e in self.exprs: try: ret = e._parse(instring, loc, doActions) return ret except ParseException as err: if err.loc > maxExcLoc: maxException = err maxExcLoc = err.loc except IndexError: if len(instring) > maxExcLoc: maxException = ParseException(instring, len(instring), e.errmsg, self) maxExcLoc = len(instring) # only got here if no expression matched, raise exception for match that made it the furthest else: if maxException is not None: maxException.msg = self.errmsg raise maxException else: raise ParseException(instring, loc, "no defined alternatives to match", self) def __ior__(self, other): if isinstance(other, basestring): other = self._literalStringClass(other) return self.append(other) # MatchFirst([self, other]) def __str__(self): if hasattr(self, "name"): return self.name if self.strRepr is None: self.strRepr = "{" + " | ".join(_ustr(e) for e in self.exprs) + "}" return self.strRepr def checkRecursion(self, parseElementList): subRecCheckList = parseElementList[:] + [self] for e in self.exprs: e.checkRecursion(subRecCheckList) def _setResultsName(self, name, listAllMatches=False): if (not __compat__.collect_all_And_tokens and __diag__.warn_multiple_tokens_in_named_alternation): if any(isinstance(e, And) for e in self.exprs): warnings.warn("{0}: setting results name {1!r} on {2} expression " "may only return a single token for an And alternative, " "in future will return the full list of tokens".format( "warn_multiple_tokens_in_named_alternation", name, type(self).__name__), stacklevel=3) return super(MatchFirst, self)._setResultsName(name, listAllMatches) class Each(ParseExpression): """Requires all given :class:`ParseExpression` s to be found, but in any order. Expressions may be separated by whitespace. May be constructed using the ``'&'`` operator. Example:: color = oneOf("RED ORANGE YELLOW GREEN BLUE PURPLE BLACK WHITE BROWN") shape_type = oneOf("SQUARE CIRCLE TRIANGLE STAR HEXAGON OCTAGON") integer = Word(nums) shape_attr = "shape:" + shape_type("shape") posn_attr = "posn:" + Group(integer("x") + ',' + integer("y"))("posn") color_attr = "color:" + color("color") size_attr = "size:" + integer("size") # use Each (using operator '&') to accept attributes in any order # (shape and posn are required, color and size are optional) shape_spec = shape_attr & posn_attr & Optional(color_attr) & Optional(size_attr) shape_spec.runTests(''' shape: SQUARE color: BLACK posn: 100, 120 shape: CIRCLE size: 50 color: BLUE posn: 50,80 color:GREEN size:20 shape:TRIANGLE posn:20,40 ''' ) prints:: shape: SQUARE color: BLACK posn: 100, 120 ['shape:', 'SQUARE', 'color:', 'BLACK', 'posn:', ['100', ',', '120']] - color: BLACK - posn: ['100', ',', '120'] - x: 100 - y: 120 - shape: SQUARE shape: CIRCLE size: 50 color: BLUE posn: 50,80 ['shape:', 'CIRCLE', 'size:', '50', 'color:', 'BLUE', 'posn:', ['50', ',', '80']] - color: BLUE - posn: ['50', ',', '80'] - x: 50 - y: 80 - shape: CIRCLE - size: 50 color: GREEN size: 20 shape: TRIANGLE posn: 20,40 ['color:', 'GREEN', 'size:', '20', 'shape:', 'TRIANGLE', 'posn:', ['20', ',', '40']] - color: GREEN - posn: ['20', ',', '40'] - x: 20 - y: 40 - shape: TRIANGLE - size: 20 """ def __init__(self, exprs, savelist=True): super(Each, self).__init__(exprs, savelist) self.mayReturnEmpty = all(e.mayReturnEmpty for e in self.exprs) self.skipWhitespace = True self.initExprGroups = True self.saveAsList = True def streamline(self): super(Each, self).streamline() self.mayReturnEmpty = all(e.mayReturnEmpty for e in self.exprs) return self def parseImpl(self, instring, loc, doActions=True): if self.initExprGroups: self.opt1map = dict((id(e.expr), e) for e in self.exprs if isinstance(e, Optional)) opt1 = [e.expr for e in self.exprs if isinstance(e, Optional)] opt2 = [e for e in self.exprs if e.mayReturnEmpty and not isinstance(e, (Optional, Regex))] self.optionals = opt1 + opt2 self.multioptionals = [e.expr for e in self.exprs if isinstance(e, ZeroOrMore)] self.multirequired = [e.expr for e in self.exprs if isinstance(e, OneOrMore)] self.required = [e for e in self.exprs if not isinstance(e, (Optional, ZeroOrMore, OneOrMore))] self.required += self.multirequired self.initExprGroups = False tmpLoc = loc tmpReqd = self.required[:] tmpOpt = self.optionals[:] matchOrder = [] keepMatching = True while keepMatching: tmpExprs = tmpReqd + tmpOpt + self.multioptionals + self.multirequired failed = [] for e in tmpExprs: try: tmpLoc = e.tryParse(instring, tmpLoc) except ParseException: failed.append(e) else: matchOrder.append(self.opt1map.get(id(e), e)) if e in tmpReqd: tmpReqd.remove(e) elif e in tmpOpt: tmpOpt.remove(e) if len(failed) == len(tmpExprs): keepMatching = False if tmpReqd: missing = ", ".join(_ustr(e) for e in tmpReqd) raise ParseException(instring, loc, "Missing one or more required elements (%s)" % missing) # add any unmatched Optionals, in case they have default values defined matchOrder += [e for e in self.exprs if isinstance(e, Optional) and e.expr in tmpOpt] resultlist = [] for e in matchOrder: loc, results = e._parse(instring, loc, doActions) resultlist.append(results) finalResults = sum(resultlist, ParseResults([])) return loc, finalResults def __str__(self): if hasattr(self, "name"): return self.name if self.strRepr is None: self.strRepr = "{" + " & ".join(_ustr(e) for e in self.exprs) + "}" return self.strRepr def checkRecursion(self, parseElementList): subRecCheckList = parseElementList[:] + [self] for e in self.exprs: e.checkRecursion(subRecCheckList) class ParseElementEnhance(ParserElement): """Abstract subclass of :class:`ParserElement`, for combining and post-processing parsed tokens. """ def __init__(self, expr, savelist=False): super(ParseElementEnhance, self).__init__(savelist) if isinstance(expr, basestring): if issubclass(self._literalStringClass, Token): expr = self._literalStringClass(expr) else: expr = self._literalStringClass(Literal(expr)) self.expr = expr self.strRepr = None if expr is not None: self.mayIndexError = expr.mayIndexError self.mayReturnEmpty = expr.mayReturnEmpty self.setWhitespaceChars(expr.whiteChars) self.skipWhitespace = expr.skipWhitespace self.saveAsList = expr.saveAsList self.callPreparse = expr.callPreparse self.ignoreExprs.extend(expr.ignoreExprs) def parseImpl(self, instring, loc, doActions=True): if self.expr is not None: return self.expr._parse(instring, loc, doActions, callPreParse=False) else: raise ParseException("", loc, self.errmsg, self) def leaveWhitespace(self): self.skipWhitespace = False self.expr = self.expr.copy() if self.expr is not None: self.expr.leaveWhitespace() return self def ignore(self, other): if isinstance(other, Suppress): if other not in self.ignoreExprs: super(ParseElementEnhance, self).ignore(other) if self.expr is not None: self.expr.ignore(self.ignoreExprs[-1]) else: super(ParseElementEnhance, self).ignore(other) if self.expr is not None: self.expr.ignore(self.ignoreExprs[-1]) return self def streamline(self): super(ParseElementEnhance, self).streamline() if self.expr is not None: self.expr.streamline() return self def checkRecursion(self, parseElementList): if self in parseElementList: raise RecursiveGrammarException(parseElementList + [self]) subRecCheckList = parseElementList[:] + [self] if self.expr is not None: self.expr.checkRecursion(subRecCheckList) def validate(self, validateTrace=None): if validateTrace is None: validateTrace = [] tmp = validateTrace[:] + [self] if self.expr is not None: self.expr.validate(tmp) self.checkRecursion([]) def __str__(self): try: return super(ParseElementEnhance, self).__str__() except Exception: pass if self.strRepr is None and self.expr is not None: self.strRepr = "%s:(%s)" % (self.__class__.__name__, _ustr(self.expr)) return self.strRepr class FollowedBy(ParseElementEnhance): """Lookahead matching of the given parse expression. ``FollowedBy`` does *not* advance the parsing position within the input string, it only verifies that the specified parse expression matches at the current position. ``FollowedBy`` always returns a null token list. If any results names are defined in the lookahead expression, those *will* be returned for access by name. Example:: # use FollowedBy to match a label only if it is followed by a ':' data_word = Word(alphas) label = data_word + FollowedBy(':') attr_expr = Group(label + Suppress(':') + OneOrMore(data_word, stopOn=label).setParseAction(' '.join)) OneOrMore(attr_expr).parseString("shape: SQUARE color: BLACK posn: upper left").pprint() prints:: [['shape', 'SQUARE'], ['color', 'BLACK'], ['posn', 'upper left']] """ def __init__(self, expr): super(FollowedBy, self).__init__(expr) self.mayReturnEmpty = True def parseImpl(self, instring, loc, doActions=True): # by using self._expr.parse and deleting the contents of the returned ParseResults list # we keep any named results that were defined in the FollowedBy expression _, ret = self.expr._parse(instring, loc, doActions=doActions) del ret[:] return loc, ret class PrecededBy(ParseElementEnhance): """Lookbehind matching of the given parse expression. ``PrecededBy`` does not advance the parsing position within the input string, it only verifies that the specified parse expression matches prior to the current position. ``PrecededBy`` always returns a null token list, but if a results name is defined on the given expression, it is returned. Parameters: - expr - expression that must match prior to the current parse location - retreat - (default= ``None``) - (int) maximum number of characters to lookbehind prior to the current parse location If the lookbehind expression is a string, Literal, Keyword, or a Word or CharsNotIn with a specified exact or maximum length, then the retreat parameter is not required. Otherwise, retreat must be specified to give a maximum number of characters to look back from the current parse position for a lookbehind match. Example:: # VB-style variable names with type prefixes int_var = PrecededBy("#") + pyparsing_common.identifier str_var = PrecededBy("$") + pyparsing_common.identifier """ def __init__(self, expr, retreat=None): super(PrecededBy, self).__init__(expr) self.expr = self.expr().leaveWhitespace() self.mayReturnEmpty = True self.mayIndexError = False self.exact = False if isinstance(expr, str): retreat = len(expr) self.exact = True elif isinstance(expr, (Literal, Keyword)): retreat = expr.matchLen self.exact = True elif isinstance(expr, (Word, CharsNotIn)) and expr.maxLen != _MAX_INT: retreat = expr.maxLen self.exact = True elif isinstance(expr, _PositionToken): retreat = 0 self.exact = True self.retreat = retreat self.errmsg = "not preceded by " + str(expr) self.skipWhitespace = False self.parseAction.append(lambda s, l, t: t.__delitem__(slice(None, None))) def parseImpl(self, instring, loc=0, doActions=True): if self.exact: if loc < self.retreat: raise ParseException(instring, loc, self.errmsg) start = loc - self.retreat _, ret = self.expr._parse(instring, start) else: # retreat specified a maximum lookbehind window, iterate test_expr = self.expr + StringEnd() instring_slice = instring[max(0, loc - self.retreat):loc] last_expr = ParseException(instring, loc, self.errmsg) for offset in range(1, min(loc, self.retreat + 1)+1): try: # print('trying', offset, instring_slice, repr(instring_slice[loc - offset:])) _, ret = test_expr._parse(instring_slice, len(instring_slice) - offset) except ParseBaseException as pbe: last_expr = pbe else: break else: raise last_expr return loc, ret class NotAny(ParseElementEnhance): """Lookahead to disallow matching with the given parse expression. ``NotAny`` does *not* advance the parsing position within the input string, it only verifies that the specified parse expression does *not* match at the current position. Also, ``NotAny`` does *not* skip over leading whitespace. ``NotAny`` always returns a null token list. May be constructed using the '~' operator. Example:: AND, OR, NOT = map(CaselessKeyword, "AND OR NOT".split()) # take care not to mistake keywords for identifiers ident = ~(AND | OR | NOT) + Word(alphas) boolean_term = Optional(NOT) + ident # very crude boolean expression - to support parenthesis groups and # operation hierarchy, use infixNotation boolean_expr = boolean_term + ZeroOrMore((AND | OR) + boolean_term) # integers that are followed by "." are actually floats integer = Word(nums) + ~Char(".") """ def __init__(self, expr): super(NotAny, self).__init__(expr) # ~ self.leaveWhitespace() self.skipWhitespace = False # do NOT use self.leaveWhitespace(), don't want to propagate to exprs self.mayReturnEmpty = True self.errmsg = "Found unwanted token, " + _ustr(self.expr) def parseImpl(self, instring, loc, doActions=True): if self.expr.canParseNext(instring, loc): raise ParseException(instring, loc, self.errmsg, self) return loc, [] def __str__(self): if hasattr(self, "name"): return self.name if self.strRepr is None: self.strRepr = "~{" + _ustr(self.expr) + "}" return self.strRepr class _MultipleMatch(ParseElementEnhance): def __init__(self, expr, stopOn=None): super(_MultipleMatch, self).__init__(expr) self.saveAsList = True ender = stopOn if isinstance(ender, basestring): ender = self._literalStringClass(ender) self.stopOn(ender) def stopOn(self, ender): if isinstance(ender, basestring): ender = self._literalStringClass(ender) self.not_ender = ~ender if ender is not None else None return self def parseImpl(self, instring, loc, doActions=True): self_expr_parse = self.expr._parse self_skip_ignorables = self._skipIgnorables check_ender = self.not_ender is not None if check_ender: try_not_ender = self.not_ender.tryParse # must be at least one (but first see if we are the stopOn sentinel; # if so, fail) if check_ender: try_not_ender(instring, loc) loc, tokens = self_expr_parse(instring, loc, doActions, callPreParse=False) try: hasIgnoreExprs = (not not self.ignoreExprs) while 1: if check_ender: try_not_ender(instring, loc) if hasIgnoreExprs: preloc = self_skip_ignorables(instring, loc) else: preloc = loc loc, tmptokens = self_expr_parse(instring, preloc, doActions) if tmptokens or tmptokens.haskeys(): tokens += tmptokens except (ParseException, IndexError): pass return loc, tokens def _setResultsName(self, name, listAllMatches=False): if __diag__.warn_ungrouped_named_tokens_in_collection: for e in [self.expr] + getattr(self.expr, 'exprs', []): if isinstance(e, ParserElement) and e.resultsName: warnings.warn("{0}: setting results name {1!r} on {2} expression " "collides with {3!r} on contained expression".format("warn_ungrouped_named_tokens_in_collection", name, type(self).__name__, e.resultsName), stacklevel=3) return super(_MultipleMatch, self)._setResultsName(name, listAllMatches) class OneOrMore(_MultipleMatch): """Repetition of one or more of the given expression. Parameters: - expr - expression that must match one or more times - stopOn - (default= ``None``) - expression for a terminating sentinel (only required if the sentinel would ordinarily match the repetition expression) Example:: data_word = Word(alphas) label = data_word + FollowedBy(':') attr_expr = Group(label + Suppress(':') + OneOrMore(data_word).setParseAction(' '.join)) text = "shape: SQUARE posn: upper left color: BLACK" OneOrMore(attr_expr).parseString(text).pprint() # Fail! read 'color' as data instead of next label -> [['shape', 'SQUARE color']] # use stopOn attribute for OneOrMore to avoid reading label string as part of the data attr_expr = Group(label + Suppress(':') + OneOrMore(data_word, stopOn=label).setParseAction(' '.join)) OneOrMore(attr_expr).parseString(text).pprint() # Better -> [['shape', 'SQUARE'], ['posn', 'upper left'], ['color', 'BLACK']] # could also be written as (attr_expr * (1,)).parseString(text).pprint() """ def __str__(self): if hasattr(self, "name"): return self.name if self.strRepr is None: self.strRepr = "{" + _ustr(self.expr) + "}..." return self.strRepr class ZeroOrMore(_MultipleMatch): """Optional repetition of zero or more of the given expression. Parameters: - expr - expression that must match zero or more times - stopOn - (default= ``None``) - expression for a terminating sentinel (only required if the sentinel would ordinarily match the repetition expression) Example: similar to :class:`OneOrMore` """ def __init__(self, expr, stopOn=None): super(ZeroOrMore, self).__init__(expr, stopOn=stopOn) self.mayReturnEmpty = True def parseImpl(self, instring, loc, doActions=True): try: return super(ZeroOrMore, self).parseImpl(instring, loc, doActions) except (ParseException, IndexError): return loc, [] def __str__(self): if hasattr(self, "name"): return self.name if self.strRepr is None: self.strRepr = "[" + _ustr(self.expr) + "]..." return self.strRepr class _NullToken(object): def __bool__(self): return False __nonzero__ = __bool__ def __str__(self): return "" class Optional(ParseElementEnhance): """Optional matching of the given expression. Parameters: - expr - expression that must match zero or more times - default (optional) - value to be returned if the optional expression is not found. Example:: # US postal code can be a 5-digit zip, plus optional 4-digit qualifier zip = Combine(Word(nums, exact=5) + Optional('-' + Word(nums, exact=4))) zip.runTests(''' # traditional ZIP code 12345 # ZIP+4 form 12101-0001 # invalid ZIP 98765- ''') prints:: # traditional ZIP code 12345 ['12345'] # ZIP+4 form 12101-0001 ['12101-0001'] # invalid ZIP 98765- ^ FAIL: Expected end of text (at char 5), (line:1, col:6) """ __optionalNotMatched = _NullToken() def __init__(self, expr, default=__optionalNotMatched): super(Optional, self).__init__(expr, savelist=False) self.saveAsList = self.expr.saveAsList self.defaultValue = default self.mayReturnEmpty = True def parseImpl(self, instring, loc, doActions=True): try: loc, tokens = self.expr._parse(instring, loc, doActions, callPreParse=False) except (ParseException, IndexError): if self.defaultValue is not self.__optionalNotMatched: if self.expr.resultsName: tokens = ParseResults([self.defaultValue]) tokens[self.expr.resultsName] = self.defaultValue else: tokens = [self.defaultValue] else: tokens = [] return loc, tokens def __str__(self): if hasattr(self, "name"): return self.name if self.strRepr is None: self.strRepr = "[" + _ustr(self.expr) + "]" return self.strRepr class SkipTo(ParseElementEnhance): """Token for skipping over all undefined text until the matched expression is found. Parameters: - expr - target expression marking the end of the data to be skipped - include - (default= ``False``) if True, the target expression is also parsed (the skipped text and target expression are returned as a 2-element list). - ignore - (default= ``None``) used to define grammars (typically quoted strings and comments) that might contain false matches to the target expression - failOn - (default= ``None``) define expressions that are not allowed to be included in the skipped test; if found before the target expression is found, the SkipTo is not a match Example:: report = ''' Outstanding Issues Report - 1 Jan 2000 # | Severity | Description | Days Open -----+----------+-------------------------------------------+----------- 101 | Critical | Intermittent system crash | 6 94 | Cosmetic | Spelling error on Login ('log|n') | 14 79 | Minor | System slow when running too many reports | 47 ''' integer = Word(nums) SEP = Suppress('|') # use SkipTo to simply match everything up until the next SEP # - ignore quoted strings, so that a '|' character inside a quoted string does not match # - parse action will call token.strip() for each matched token, i.e., the description body string_data = SkipTo(SEP, ignore=quotedString) string_data.setParseAction(tokenMap(str.strip)) ticket_expr = (integer("issue_num") + SEP + string_data("sev") + SEP + string_data("desc") + SEP + integer("days_open")) for tkt in ticket_expr.searchString(report): print tkt.dump() prints:: ['101', 'Critical', 'Intermittent system crash', '6'] - days_open: 6 - desc: Intermittent system crash - issue_num: 101 - sev: Critical ['94', 'Cosmetic', "Spelling error on Login ('log|n')", '14'] - days_open: 14 - desc: Spelling error on Login ('log|n') - issue_num: 94 - sev: Cosmetic ['79', 'Minor', 'System slow when running too many reports', '47'] - days_open: 47 - desc: System slow when running too many reports - issue_num: 79 - sev: Minor """ def __init__(self, other, include=False, ignore=None, failOn=None): super(SkipTo, self).__init__(other) self.ignoreExpr = ignore self.mayReturnEmpty = True self.mayIndexError = False self.includeMatch = include self.saveAsList = False if isinstance(failOn, basestring): self.failOn = self._literalStringClass(failOn) else: self.failOn = failOn self.errmsg = "No match found for " + _ustr(self.expr) def parseImpl(self, instring, loc, doActions=True): startloc = loc instrlen = len(instring) expr = self.expr expr_parse = self.expr._parse self_failOn_canParseNext = self.failOn.canParseNext if self.failOn is not None else None self_ignoreExpr_tryParse = self.ignoreExpr.tryParse if self.ignoreExpr is not None else None tmploc = loc while tmploc <= instrlen: if self_failOn_canParseNext is not None: # break if failOn expression matches if self_failOn_canParseNext(instring, tmploc): break if self_ignoreExpr_tryParse is not None: # advance past ignore expressions while 1: try: tmploc = self_ignoreExpr_tryParse(instring, tmploc) except ParseBaseException: break try: expr_parse(instring, tmploc, doActions=False, callPreParse=False) except (ParseException, IndexError): # no match, advance loc in string tmploc += 1 else: # matched skipto expr, done break else: # ran off the end of the input string without matching skipto expr, fail raise ParseException(instring, loc, self.errmsg, self) # build up return values loc = tmploc skiptext = instring[startloc:loc] skipresult = ParseResults(skiptext) if self.includeMatch: loc, mat = expr_parse(instring, loc, doActions, callPreParse=False) skipresult += mat return loc, skipresult class Forward(ParseElementEnhance): """Forward declaration of an expression to be defined later - used for recursive grammars, such as algebraic infix notation. When the expression is known, it is assigned to the ``Forward`` variable using the '<<' operator. Note: take care when assigning to ``Forward`` not to overlook precedence of operators. Specifically, '|' has a lower precedence than '<<', so that:: fwdExpr << a | b | c will actually be evaluated as:: (fwdExpr << a) | b | c thereby leaving b and c out as parseable alternatives. It is recommended that you explicitly group the values inserted into the ``Forward``:: fwdExpr << (a | b | c) Converting to use the '<<=' operator instead will avoid this problem. See :class:`ParseResults.pprint` for an example of a recursive parser created using ``Forward``. """ def __init__(self, other=None): super(Forward, self).__init__(other, savelist=False) def __lshift__(self, other): if isinstance(other, basestring): other = self._literalStringClass(other) self.expr = other self.strRepr = None self.mayIndexError = self.expr.mayIndexError self.mayReturnEmpty = self.expr.mayReturnEmpty self.setWhitespaceChars(self.expr.whiteChars) self.skipWhitespace = self.expr.skipWhitespace self.saveAsList = self.expr.saveAsList self.ignoreExprs.extend(self.expr.ignoreExprs) return self def __ilshift__(self, other): return self << other def leaveWhitespace(self): self.skipWhitespace = False return self def streamline(self): if not self.streamlined: self.streamlined = True if self.expr is not None: self.expr.streamline() return self def validate(self, validateTrace=None): if validateTrace is None: validateTrace = [] if self not in validateTrace: tmp = validateTrace[:] + [self] if self.expr is not None: self.expr.validate(tmp) self.checkRecursion([]) def __str__(self): if hasattr(self, "name"): return self.name if self.strRepr is not None: return self.strRepr # Avoid infinite recursion by setting a temporary strRepr self.strRepr = ": ..." # Use the string representation of main expression. retString = '...' try: if self.expr is not None: retString = _ustr(self.expr)[:1000] else: retString = "None" finally: self.strRepr = self.__class__.__name__ + ": " + retString return self.strRepr def copy(self): if self.expr is not None: return super(Forward, self).copy() else: ret = Forward() ret <<= self return ret def _setResultsName(self, name, listAllMatches=False): if __diag__.warn_name_set_on_empty_Forward: if self.expr is None: warnings.warn("{0}: setting results name {0!r} on {1} expression " "that has no contained expression".format("warn_name_set_on_empty_Forward", name, type(self).__name__), stacklevel=3) return super(Forward, self)._setResultsName(name, listAllMatches) class TokenConverter(ParseElementEnhance): """ Abstract subclass of :class:`ParseExpression`, for converting parsed results. """ def __init__(self, expr, savelist=False): super(TokenConverter, self).__init__(expr) # , savelist) self.saveAsList = False class Combine(TokenConverter): """Converter to concatenate all matching tokens to a single string. By default, the matching patterns must also be contiguous in the input string; this can be disabled by specifying ``'adjacent=False'`` in the constructor. Example:: real = Word(nums) + '.' + Word(nums) print(real.parseString('3.1416')) # -> ['3', '.', '1416'] # will also erroneously match the following print(real.parseString('3. 1416')) # -> ['3', '.', '1416'] real = Combine(Word(nums) + '.' + Word(nums)) print(real.parseString('3.1416')) # -> ['3.1416'] # no match when there are internal spaces print(real.parseString('3. 1416')) # -> Exception: Expected W:(0123...) """ def __init__(self, expr, joinString="", adjacent=True): super(Combine, self).__init__(expr) # suppress whitespace-stripping in contained parse expressions, but re-enable it on the Combine itself if adjacent: self.leaveWhitespace() self.adjacent = adjacent self.skipWhitespace = True self.joinString = joinString self.callPreparse = True def ignore(self, other): if self.adjacent: ParserElement.ignore(self, other) else: super(Combine, self).ignore(other) return self def postParse(self, instring, loc, tokenlist): retToks = tokenlist.copy() del retToks[:] retToks += ParseResults(["".join(tokenlist._asStringList(self.joinString))], modal=self.modalResults) if self.resultsName and retToks.haskeys(): return [retToks] else: return retToks class Group(TokenConverter): """Converter to return the matched tokens as a list - useful for returning tokens of :class:`ZeroOrMore` and :class:`OneOrMore` expressions. Example:: ident = Word(alphas) num = Word(nums) term = ident | num func = ident + Optional(delimitedList(term)) print(func.parseString("fn a, b, 100")) # -> ['fn', 'a', 'b', '100'] func = ident + Group(Optional(delimitedList(term))) print(func.parseString("fn a, b, 100")) # -> ['fn', ['a', 'b', '100']] """ def __init__(self, expr): super(Group, self).__init__(expr) self.saveAsList = True def postParse(self, instring, loc, tokenlist): return [tokenlist] class Dict(TokenConverter): """Converter to return a repetitive expression as a list, but also as a dictionary. Each element can also be referenced using the first token in the expression as its key. Useful for tabular report scraping when the first column can be used as a item key. Example:: data_word = Word(alphas) label = data_word + FollowedBy(':') attr_expr = Group(label + Suppress(':') + OneOrMore(data_word).setParseAction(' '.join)) text = "shape: SQUARE posn: upper left color: light blue texture: burlap" attr_expr = (label + Suppress(':') + OneOrMore(data_word, stopOn=label).setParseAction(' '.join)) # print attributes as plain groups print(OneOrMore(attr_expr).parseString(text).dump()) # instead of OneOrMore(expr), parse using Dict(OneOrMore(Group(expr))) - Dict will auto-assign names result = Dict(OneOrMore(Group(attr_expr))).parseString(text) print(result.dump()) # access named fields as dict entries, or output as dict print(result['shape']) print(result.asDict()) prints:: ['shape', 'SQUARE', 'posn', 'upper left', 'color', 'light blue', 'texture', 'burlap'] [['shape', 'SQUARE'], ['posn', 'upper left'], ['color', 'light blue'], ['texture', 'burlap']] - color: light blue - posn: upper left - shape: SQUARE - texture: burlap SQUARE {'color': 'light blue', 'posn': 'upper left', 'texture': 'burlap', 'shape': 'SQUARE'} See more examples at :class:`ParseResults` of accessing fields by results name. """ def __init__(self, expr): super(Dict, self).__init__(expr) self.saveAsList = True def postParse(self, instring, loc, tokenlist): for i, tok in enumerate(tokenlist): if len(tok) == 0: continue ikey = tok[0] if isinstance(ikey, int): ikey = _ustr(tok[0]).strip() if len(tok) == 1: tokenlist[ikey] = _ParseResultsWithOffset("", i) elif len(tok) == 2 and not isinstance(tok[1], ParseResults): tokenlist[ikey] = _ParseResultsWithOffset(tok[1], i) else: dictvalue = tok.copy() # ParseResults(i) del dictvalue[0] if len(dictvalue) != 1 or (isinstance(dictvalue, ParseResults) and dictvalue.haskeys()): tokenlist[ikey] = _ParseResultsWithOffset(dictvalue, i) else: tokenlist[ikey] = _ParseResultsWithOffset(dictvalue[0], i) if self.resultsName: return [tokenlist] else: return tokenlist class Suppress(TokenConverter): """Converter for ignoring the results of a parsed expression. Example:: source = "a, b, c,d" wd = Word(alphas) wd_list1 = wd + ZeroOrMore(',' + wd) print(wd_list1.parseString(source)) # often, delimiters that are useful during parsing are just in the # way afterward - use Suppress to keep them out of the parsed output wd_list2 = wd + ZeroOrMore(Suppress(',') + wd) print(wd_list2.parseString(source)) prints:: ['a', ',', 'b', ',', 'c', ',', 'd'] ['a', 'b', 'c', 'd'] (See also :class:`delimitedList`.) """ def postParse(self, instring, loc, tokenlist): return [] def suppress(self): return self class OnlyOnce(object): """Wrapper for parse actions, to ensure they are only called once. """ def __init__(self, methodCall): self.callable = _trim_arity(methodCall) self.called = False def __call__(self, s, l, t): if not self.called: results = self.callable(s, l, t) self.called = True return results raise ParseException(s, l, "") def reset(self): self.called = False def traceParseAction(f): """Decorator for debugging parse actions. When the parse action is called, this decorator will print ``">> entering method-name(line:, , )"``. When the parse action completes, the decorator will print ``"<<"`` followed by the returned value, or any exception that the parse action raised. Example:: wd = Word(alphas) @traceParseAction def remove_duplicate_chars(tokens): return ''.join(sorted(set(''.join(tokens)))) wds = OneOrMore(wd).setParseAction(remove_duplicate_chars) print(wds.parseString("slkdjs sld sldd sdlf sdljf")) prints:: >>entering remove_duplicate_chars(line: 'slkdjs sld sldd sdlf sdljf', 0, (['slkdjs', 'sld', 'sldd', 'sdlf', 'sdljf'], {})) < 3: thisFunc = paArgs[0].__class__.__name__ + '.' + thisFunc sys.stderr.write(">>entering %s(line: '%s', %d, %r)\n" % (thisFunc, line(l, s), l, t)) try: ret = f(*paArgs) except Exception as exc: sys.stderr.write("< ['aa', 'bb', 'cc'] delimitedList(Word(hexnums), delim=':', combine=True).parseString("AA:BB:CC:DD:EE") # -> ['AA:BB:CC:DD:EE'] """ dlName = _ustr(expr) + " [" + _ustr(delim) + " " + _ustr(expr) + "]..." if combine: return Combine(expr + ZeroOrMore(delim + expr)).setName(dlName) else: return (expr + ZeroOrMore(Suppress(delim) + expr)).setName(dlName) def countedArray(expr, intExpr=None): """Helper to define a counted list of expressions. This helper defines a pattern of the form:: integer expr expr expr... where the leading integer tells how many expr expressions follow. The matched tokens returns the array of expr tokens as a list - the leading count token is suppressed. If ``intExpr`` is specified, it should be a pyparsing expression that produces an integer value. Example:: countedArray(Word(alphas)).parseString('2 ab cd ef') # -> ['ab', 'cd'] # in this parser, the leading integer value is given in binary, # '10' indicating that 2 values are in the array binaryConstant = Word('01').setParseAction(lambda t: int(t[0], 2)) countedArray(Word(alphas), intExpr=binaryConstant).parseString('10 ab cd ef') # -> ['ab', 'cd'] """ arrayExpr = Forward() def countFieldParseAction(s, l, t): n = t[0] arrayExpr << (n and Group(And([expr] * n)) or Group(empty)) return [] if intExpr is None: intExpr = Word(nums).setParseAction(lambda t: int(t[0])) else: intExpr = intExpr.copy() intExpr.setName("arrayLen") intExpr.addParseAction(countFieldParseAction, callDuringTry=True) return (intExpr + arrayExpr).setName('(len) ' + _ustr(expr) + '...') def _flatten(L): ret = [] for i in L: if isinstance(i, list): ret.extend(_flatten(i)) else: ret.append(i) return ret def matchPreviousLiteral(expr): """Helper to define an expression that is indirectly defined from the tokens matched in a previous expression, that is, it looks for a 'repeat' of a previous expression. For example:: first = Word(nums) second = matchPreviousLiteral(first) matchExpr = first + ":" + second will match ``"1:1"``, but not ``"1:2"``. Because this matches a previous literal, will also match the leading ``"1:1"`` in ``"1:10"``. If this is not desired, use :class:`matchPreviousExpr`. Do *not* use with packrat parsing enabled. """ rep = Forward() def copyTokenToRepeater(s, l, t): if t: if len(t) == 1: rep << t[0] else: # flatten t tokens tflat = _flatten(t.asList()) rep << And(Literal(tt) for tt in tflat) else: rep << Empty() expr.addParseAction(copyTokenToRepeater, callDuringTry=True) rep.setName('(prev) ' + _ustr(expr)) return rep def matchPreviousExpr(expr): """Helper to define an expression that is indirectly defined from the tokens matched in a previous expression, that is, it looks for a 'repeat' of a previous expression. For example:: first = Word(nums) second = matchPreviousExpr(first) matchExpr = first + ":" + second will match ``"1:1"``, but not ``"1:2"``. Because this matches by expressions, will *not* match the leading ``"1:1"`` in ``"1:10"``; the expressions are evaluated first, and then compared, so ``"1"`` is compared with ``"10"``. Do *not* use with packrat parsing enabled. """ rep = Forward() e2 = expr.copy() rep <<= e2 def copyTokenToRepeater(s, l, t): matchTokens = _flatten(t.asList()) def mustMatchTheseTokens(s, l, t): theseTokens = _flatten(t.asList()) if theseTokens != matchTokens: raise ParseException('', 0, '') rep.setParseAction(mustMatchTheseTokens, callDuringTry=True) expr.addParseAction(copyTokenToRepeater, callDuringTry=True) rep.setName('(prev) ' + _ustr(expr)) return rep def _escapeRegexRangeChars(s): # ~ escape these chars: ^-[] for c in r"\^-[]": s = s.replace(c, _bslash + c) s = s.replace("\n", r"\n") s = s.replace("\t", r"\t") return _ustr(s) def oneOf(strs, caseless=False, useRegex=True, asKeyword=False): """Helper to quickly define a set of alternative Literals, and makes sure to do longest-first testing when there is a conflict, regardless of the input order, but returns a :class:`MatchFirst` for best performance. Parameters: - strs - a string of space-delimited literals, or a collection of string literals - caseless - (default= ``False``) - treat all literals as caseless - useRegex - (default= ``True``) - as an optimization, will generate a Regex object; otherwise, will generate a :class:`MatchFirst` object (if ``caseless=True`` or ``asKeyword=True``, or if creating a :class:`Regex` raises an exception) - asKeyword - (default=``False``) - enforce Keyword-style matching on the generated expressions Example:: comp_oper = oneOf("< = > <= >= !=") var = Word(alphas) number = Word(nums) term = var | number comparison_expr = term + comp_oper + term print(comparison_expr.searchString("B = 12 AA=23 B<=AA AA>12")) prints:: [['B', '=', '12'], ['AA', '=', '23'], ['B', '<=', 'AA'], ['AA', '>', '12']] """ if isinstance(caseless, basestring): warnings.warn("More than one string argument passed to oneOf, pass " "choices as a list or space-delimited string", stacklevel=2) if caseless: isequal = (lambda a, b: a.upper() == b.upper()) masks = (lambda a, b: b.upper().startswith(a.upper())) parseElementClass = CaselessKeyword if asKeyword else CaselessLiteral else: isequal = (lambda a, b: a == b) masks = (lambda a, b: b.startswith(a)) parseElementClass = Keyword if asKeyword else Literal symbols = [] if isinstance(strs, basestring): symbols = strs.split() elif isinstance(strs, Iterable): symbols = list(strs) else: warnings.warn("Invalid argument to oneOf, expected string or iterable", SyntaxWarning, stacklevel=2) if not symbols: return NoMatch() if not asKeyword: # if not producing keywords, need to reorder to take care to avoid masking # longer choices with shorter ones i = 0 while i < len(symbols) - 1: cur = symbols[i] for j, other in enumerate(symbols[i + 1:]): if isequal(other, cur): del symbols[i + j + 1] break elif masks(cur, other): del symbols[i + j + 1] symbols.insert(i, other) break else: i += 1 if not (caseless or asKeyword) and useRegex: # ~ print (strs, "->", "|".join([_escapeRegexChars(sym) for sym in symbols])) try: if len(symbols) == len("".join(symbols)): return Regex("[%s]" % "".join(_escapeRegexRangeChars(sym) for sym in symbols)).setName(' | '.join(symbols)) else: return Regex("|".join(re.escape(sym) for sym in symbols)).setName(' | '.join(symbols)) except Exception: warnings.warn("Exception creating Regex for oneOf, building MatchFirst", SyntaxWarning, stacklevel=2) # last resort, just use MatchFirst return MatchFirst(parseElementClass(sym) for sym in symbols).setName(' | '.join(symbols)) def dictOf(key, value): """Helper to easily and clearly define a dictionary by specifying the respective patterns for the key and value. Takes care of defining the :class:`Dict`, :class:`ZeroOrMore`, and :class:`Group` tokens in the proper order. The key pattern can include delimiting markers or punctuation, as long as they are suppressed, thereby leaving the significant key text. The value pattern can include named results, so that the :class:`Dict` results can include named token fields. Example:: text = "shape: SQUARE posn: upper left color: light blue texture: burlap" attr_expr = (label + Suppress(':') + OneOrMore(data_word, stopOn=label).setParseAction(' '.join)) print(OneOrMore(attr_expr).parseString(text).dump()) attr_label = label attr_value = Suppress(':') + OneOrMore(data_word, stopOn=label).setParseAction(' '.join) # similar to Dict, but simpler call format result = dictOf(attr_label, attr_value).parseString(text) print(result.dump()) print(result['shape']) print(result.shape) # object attribute access works too print(result.asDict()) prints:: [['shape', 'SQUARE'], ['posn', 'upper left'], ['color', 'light blue'], ['texture', 'burlap']] - color: light blue - posn: upper left - shape: SQUARE - texture: burlap SQUARE SQUARE {'color': 'light blue', 'shape': 'SQUARE', 'posn': 'upper left', 'texture': 'burlap'} """ return Dict(OneOrMore(Group(key + value))) def originalTextFor(expr, asString=True): """Helper to return the original, untokenized text for a given expression. Useful to restore the parsed fields of an HTML start tag into the raw tag text itself, or to revert separate tokens with intervening whitespace back to the original matching input text. By default, returns astring containing the original parsed text. If the optional ``asString`` argument is passed as ``False``, then the return value is a :class:`ParseResults` containing any results names that were originally matched, and a single token containing the original matched text from the input string. So if the expression passed to :class:`originalTextFor` contains expressions with defined results names, you must set ``asString`` to ``False`` if you want to preserve those results name values. Example:: src = "this is test bold text normal text " for tag in ("b", "i"): opener, closer = makeHTMLTags(tag) patt = originalTextFor(opener + SkipTo(closer) + closer) print(patt.searchString(src)[0]) prints:: [' bold text '] ['text'] """ locMarker = Empty().setParseAction(lambda s, loc, t: loc) endlocMarker = locMarker.copy() endlocMarker.callPreparse = False matchExpr = locMarker("_original_start") + expr + endlocMarker("_original_end") if asString: extractText = lambda s, l, t: s[t._original_start: t._original_end] else: def extractText(s, l, t): t[:] = [s[t.pop('_original_start'):t.pop('_original_end')]] matchExpr.setParseAction(extractText) matchExpr.ignoreExprs = expr.ignoreExprs return matchExpr def ungroup(expr): """Helper to undo pyparsing's default grouping of And expressions, even if all but one are non-empty. """ return TokenConverter(expr).addParseAction(lambda t: t[0]) def locatedExpr(expr): """Helper to decorate a returned token with its starting and ending locations in the input string. This helper adds the following results names: - locn_start = location where matched expression begins - locn_end = location where matched expression ends - value = the actual parsed results Be careful if the input text contains ```` characters, you may want to call :class:`ParserElement.parseWithTabs` Example:: wd = Word(alphas) for match in locatedExpr(wd).searchString("ljsdf123lksdjjf123lkkjj1222"): print(match) prints:: [[0, 'ljsdf', 5]] [[8, 'lksdjjf', 15]] [[18, 'lkkjj', 23]] """ locator = Empty().setParseAction(lambda s, l, t: l) return Group(locator("locn_start") + expr("value") + locator.copy().leaveWhitespace()("locn_end")) # convenience constants for positional expressions empty = Empty().setName("empty") lineStart = LineStart().setName("lineStart") lineEnd = LineEnd().setName("lineEnd") stringStart = StringStart().setName("stringStart") stringEnd = StringEnd().setName("stringEnd") _escapedPunc = Word(_bslash, r"\[]-*.$+^?()~ ", exact=2).setParseAction(lambda s, l, t: t[0][1]) _escapedHexChar = Regex(r"\\0?[xX][0-9a-fA-F]+").setParseAction(lambda s, l, t: unichr(int(t[0].lstrip(r'\0x'), 16))) _escapedOctChar = Regex(r"\\0[0-7]+").setParseAction(lambda s, l, t: unichr(int(t[0][1:], 8))) _singleChar = _escapedPunc | _escapedHexChar | _escapedOctChar | CharsNotIn(r'\]', exact=1) _charRange = Group(_singleChar + Suppress("-") + _singleChar) _reBracketExpr = Literal("[") + Optional("^").setResultsName("negate") + Group(OneOrMore(_charRange | _singleChar)).setResultsName("body") + "]" def srange(s): r"""Helper to easily define string ranges for use in Word construction. Borrows syntax from regexp '[]' string range definitions:: srange("[0-9]") -> "0123456789" srange("[a-z]") -> "abcdefghijklmnopqrstuvwxyz" srange("[a-z$_]") -> "abcdefghijklmnopqrstuvwxyz$_" The input string must be enclosed in []'s, and the returned string is the expanded character set joined into a single string. The values enclosed in the []'s may be: - a single character - an escaped character with a leading backslash (such as ``\-`` or ``\]``) - an escaped hex character with a leading ``'\x'`` (``\x21``, which is a ``'!'`` character) (``\0x##`` is also supported for backwards compatibility) - an escaped octal character with a leading ``'\0'`` (``\041``, which is a ``'!'`` character) - a range of any of the above, separated by a dash (``'a-z'``, etc.) - any combination of the above (``'aeiouy'``, ``'a-zA-Z0-9_$'``, etc.) """ _expanded = lambda p: p if not isinstance(p, ParseResults) else ''.join(unichr(c) for c in range(ord(p[0]), ord(p[1]) + 1)) try: return "".join(_expanded(part) for part in _reBracketExpr.parseString(s).body) except Exception: return "" def matchOnlyAtCol(n): """Helper method for defining parse actions that require matching at a specific column in the input text. """ def verifyCol(strg, locn, toks): if col(locn, strg) != n: raise ParseException(strg, locn, "matched token not at column %d" % n) return verifyCol def replaceWith(replStr): """Helper method for common parse actions that simply return a literal value. Especially useful when used with :class:`transformString` (). Example:: num = Word(nums).setParseAction(lambda toks: int(toks[0])) na = oneOf("N/A NA").setParseAction(replaceWith(math.nan)) term = na | num OneOrMore(term).parseString("324 234 N/A 234") # -> [324, 234, nan, 234] """ return lambda s, l, t: [replStr] def removeQuotes(s, l, t): """Helper parse action for removing quotation marks from parsed quoted strings. Example:: # by default, quotation marks are included in parsed results quotedString.parseString("'Now is the Winter of our Discontent'") # -> ["'Now is the Winter of our Discontent'"] # use removeQuotes to strip quotation marks from parsed results quotedString.setParseAction(removeQuotes) quotedString.parseString("'Now is the Winter of our Discontent'") # -> ["Now is the Winter of our Discontent"] """ return t[0][1:-1] def tokenMap(func, *args): """Helper to define a parse action by mapping a function to all elements of a ParseResults list. If any additional args are passed, they are forwarded to the given function as additional arguments after the token, as in ``hex_integer = Word(hexnums).setParseAction(tokenMap(int, 16))``, which will convert the parsed data to an integer using base 16. Example (compare the last to example in :class:`ParserElement.transformString`:: hex_ints = OneOrMore(Word(hexnums)).setParseAction(tokenMap(int, 16)) hex_ints.runTests(''' 00 11 22 aa FF 0a 0d 1a ''') upperword = Word(alphas).setParseAction(tokenMap(str.upper)) OneOrMore(upperword).runTests(''' my kingdom for a horse ''') wd = Word(alphas).setParseAction(tokenMap(str.title)) OneOrMore(wd).setParseAction(' '.join).runTests(''' now is the winter of our discontent made glorious summer by this sun of york ''') prints:: 00 11 22 aa FF 0a 0d 1a [0, 17, 34, 170, 255, 10, 13, 26] my kingdom for a horse ['MY', 'KINGDOM', 'FOR', 'A', 'HORSE'] now is the winter of our discontent made glorious summer by this sun of york ['Now Is The Winter Of Our Discontent Made Glorious Summer By This Sun Of York'] """ def pa(s, l, t): return [func(tokn, *args) for tokn in t] try: func_name = getattr(func, '__name__', getattr(func, '__class__').__name__) except Exception: func_name = str(func) pa.__name__ = func_name return pa upcaseTokens = tokenMap(lambda t: _ustr(t).upper()) """(Deprecated) Helper parse action to convert tokens to upper case. Deprecated in favor of :class:`pyparsing_common.upcaseTokens`""" downcaseTokens = tokenMap(lambda t: _ustr(t).lower()) """(Deprecated) Helper parse action to convert tokens to lower case. Deprecated in favor of :class:`pyparsing_common.downcaseTokens`""" def _makeTags(tagStr, xml, suppress_LT=Suppress("<"), suppress_GT=Suppress(">")): """Internal helper to construct opening and closing tag expressions, given a tag name""" if isinstance(tagStr, basestring): resname = tagStr tagStr = Keyword(tagStr, caseless=not xml) else: resname = tagStr.name tagAttrName = Word(alphas, alphanums + "_-:") if xml: tagAttrValue = dblQuotedString.copy().setParseAction(removeQuotes) openTag = (suppress_LT + tagStr("tag") + Dict(ZeroOrMore(Group(tagAttrName + Suppress("=") + tagAttrValue))) + Optional("/", default=[False])("empty").setParseAction(lambda s, l, t: t[0] == '/') + suppress_GT) else: tagAttrValue = quotedString.copy().setParseAction(removeQuotes) | Word(printables, excludeChars=">") openTag = (suppress_LT + tagStr("tag") + Dict(ZeroOrMore(Group(tagAttrName.setParseAction(downcaseTokens) + Optional(Suppress("=") + tagAttrValue)))) + Optional("/", default=[False])("empty").setParseAction(lambda s, l, t: t[0] == '/') + suppress_GT) closeTag = Combine(_L("", adjacent=False) openTag.setName("<%s>" % resname) # add start results name in parse action now that ungrouped names are not reported at two levels openTag.addParseAction(lambda t: t.__setitem__("start" + "".join(resname.replace(":", " ").title().split()), t.copy())) closeTag = closeTag("end" + "".join(resname.replace(":", " ").title().split())).setName("" % resname) openTag.tag = resname closeTag.tag = resname openTag.tag_body = SkipTo(closeTag()) return openTag, closeTag def makeHTMLTags(tagStr): """Helper to construct opening and closing tag expressions for HTML, given a tag name. Matches tags in either upper or lower case, attributes with namespaces and with quoted or unquoted values. Example:: text = 'More info at the pyparsing wiki page' # makeHTMLTags returns pyparsing expressions for the opening and # closing tags as a 2-tuple a, a_end = makeHTMLTags("A") link_expr = a + SkipTo(a_end)("link_text") + a_end for link in link_expr.searchString(text): # attributes in the tag (like "href" shown here) are # also accessible as named results print(link.link_text, '->', link.href) prints:: pyparsing -> https://github.com/pyparsing/pyparsing/wiki """ return _makeTags(tagStr, False) def makeXMLTags(tagStr): """Helper to construct opening and closing tag expressions for XML, given a tag name. Matches tags only in the given upper/lower case. Example: similar to :class:`makeHTMLTags` """ return _makeTags(tagStr, True) def withAttribute(*args, **attrDict): """Helper to create a validating parse action to be used with start tags created with :class:`makeXMLTags` or :class:`makeHTMLTags`. Use ``withAttribute`` to qualify a starting tag with a required attribute value, to avoid false matches on common tags such as ```` or ``

``. Call ``withAttribute`` with a series of attribute names and values. Specify the list of filter attributes names and values as: - keyword arguments, as in ``(align="right")``, or - as an explicit dict with ``**`` operator, when an attribute name is also a Python reserved word, as in ``**{"class":"Customer", "align":"right"}`` - a list of name-value tuples, as in ``(("ns1:class", "Customer"), ("ns2:align", "right"))`` For attribute names with a namespace prefix, you must use the second form. Attribute names are matched insensitive to upper/lower case. If just testing for ``class`` (with or without a namespace), use :class:`withClass`. To verify that the attribute exists, but without specifying a value, pass ``withAttribute.ANY_VALUE`` as the value. Example:: html = '''

1 4 0 1 0

1,3 2,3 1,1

this has no type

''' div,div_end = makeHTMLTags("div") # only match div tag having a type attribute with value "grid" div_grid = div().setParseAction(withAttribute(type="grid")) grid_expr = div_grid + SkipTo(div | div_end)("body") for grid_header in grid_expr.searchString(html): print(grid_header.body) # construct a match with any div tag having a type attribute, regardless of the value div_any_type = div().setParseAction(withAttribute(type=withAttribute.ANY_VALUE)) div_expr = div_any_type + SkipTo(div | div_end)("body") for div_header in div_expr.searchString(html): print(div_header.body) prints:: 1 4 0 1 0 1 4 0 1 0 1,3 2,3 1,1 """ if args: attrs = args[:] else: attrs = attrDict.items() attrs = [(k, v) for k, v in attrs] def pa(s, l, tokens): for attrName, attrValue in attrs: if attrName not in tokens: raise ParseException(s, l, "no matching attribute " + attrName) if attrValue != withAttribute.ANY_VALUE and tokens[attrName] != attrValue: raise ParseException(s, l, "attribute '%s' has value '%s', must be '%s'" % (attrName, tokens[attrName], attrValue)) return pa withAttribute.ANY_VALUE = object() def withClass(classname, namespace=''): """Simplified version of :class:`withAttribute` when matching on a div class - made difficult because ``class`` is a reserved word in Python. Example:: html = '''

Some text

1 4 0 1 0

1,3 2,3 1,1

this <div> has no class