You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
141 lines
5.1 KiB
Python
141 lines
5.1 KiB
Python
# Natural Language Toolkit: Tokenizers
|
|
#
|
|
# Copyright (C) 2001-2019 NLTK Project
|
|
# Author: Yoav Goldberg <yoavg@cs.bgu.ac.il>
|
|
# Steven Bird <stevenbird1@gmail.com> (minor edits)
|
|
# URL: <http://nltk.sourceforge.net>
|
|
# For license information, see LICENSE.TXT
|
|
|
|
"""
|
|
S-Expression Tokenizer
|
|
|
|
``SExprTokenizer`` is used to find parenthesized expressions in a
|
|
string. In particular, it divides a string into a sequence of
|
|
substrings that are either parenthesized expressions (including any
|
|
nested parenthesized expressions), or other whitespace-separated
|
|
tokens.
|
|
|
|
>>> from nltk.tokenize import SExprTokenizer
|
|
>>> SExprTokenizer().tokenize('(a b (c d)) e f (g)')
|
|
['(a b (c d))', 'e', 'f', '(g)']
|
|
|
|
By default, ``SExprTokenizer`` will raise a ``ValueError`` exception if
|
|
used to tokenize an expression with non-matching parentheses:
|
|
|
|
>>> SExprTokenizer().tokenize('c) d) e (f (g')
|
|
Traceback (most recent call last):
|
|
...
|
|
ValueError: Un-matched close paren at char 1
|
|
|
|
The ``strict`` argument can be set to False to allow for
|
|
non-matching parentheses. Any unmatched close parentheses will be
|
|
listed as their own s-expression; and the last partial sexpr with
|
|
unmatched open parentheses will be listed as its own sexpr:
|
|
|
|
>>> SExprTokenizer(strict=False).tokenize('c) d) e (f (g')
|
|
['c', ')', 'd', ')', 'e', '(f (g']
|
|
|
|
The characters used for open and close parentheses may be customized
|
|
using the ``parens`` argument to the ``SExprTokenizer`` constructor:
|
|
|
|
>>> SExprTokenizer(parens='{}').tokenize('{a b {c d}} e f {g}')
|
|
['{a b {c d}}', 'e', 'f', '{g}']
|
|
|
|
The s-expression tokenizer is also available as a function:
|
|
|
|
>>> from nltk.tokenize import sexpr_tokenize
|
|
>>> sexpr_tokenize('(a b (c d)) e f (g)')
|
|
['(a b (c d))', 'e', 'f', '(g)']
|
|
|
|
"""
|
|
|
|
import re
|
|
|
|
from nltk.tokenize.api import TokenizerI
|
|
|
|
|
|
class SExprTokenizer(TokenizerI):
    """
    A tokenizer that divides strings into s-expressions.
    An s-expression can be either:

      - a parenthesized expression, including any nested parenthesized
        expressions, or
      - a sequence of non-whitespace non-parenthesis characters.

    For example, the string ``(a (b c)) d e (f)`` consists of four
    s-expressions: ``(a (b c))``, ``d``, ``e``, and ``(f)``.

    By default, the characters ``(`` and ``)`` are treated as open and
    close parentheses, but alternative strings may be specified.

    :param parens: A two-element sequence specifying the open and close parentheses
        that should be used to find sexprs.  This will typically be either a
        two-character string, or a list of two strings.
    :type parens: str or list
    :param strict: If true, then raise an exception when tokenizing an ill-formed sexpr.
    :type strict: bool
    :raises ValueError: if ``parens`` does not contain exactly two elements.
    """

    def __init__(self, parens='()', strict=True):
        if len(parens) != 2:
            raise ValueError('parens must contain exactly two strings')
        self._strict = strict
        self._open_paren = parens[0]
        self._close_paren = parens[1]
        # Both delimiters are escaped so that regex metacharacters such as
        # ``(`` or ``[`` are matched literally.
        self._paren_regexp = re.compile(
            '%s|%s' % (re.escape(parens[0]), re.escape(parens[1]))
        )

    def tokenize(self, text):
        """
        Return a list of s-expressions extracted from *text*.
        For example:

            >>> SExprTokenizer().tokenize('(a b (c d)) e f (g)')
            ['(a b (c d))', 'e', 'f', '(g)']

        Tokens outside of any parentheses are split on whitespace,
        wherever they occur in the string:

            >>> SExprTokenizer().tokenize('(a) b c')
            ['(a)', 'b', 'c']

        All parentheses are assumed to mark s-expressions.
        (No special processing is done to exclude parentheses that occur
        inside strings, or following backslash characters.)

        If the given expression contains non-matching parentheses,
        then the behavior of the tokenizer depends on the ``strict``
        parameter to the constructor.  If ``strict`` is ``True``, then
        raise a ``ValueError``.  If ``strict`` is ``False``, then any
        unmatched close parentheses will be listed as their own
        s-expression; and the last partial s-expression with unmatched open
        parentheses will be listed as its own s-expression:

            >>> SExprTokenizer(strict=False).tokenize('c) d) e (f (g')
            ['c', ')', 'd', ')', 'e', '(f (g']

        :param text: the string to be tokenized
        :type text: str
        :rtype: list(str)
        :raises ValueError: if ``strict`` is true and *text* contains an
            unmatched open or close parenthesis.
        """
        result = []
        pos = 0  # start of the text that has not been emitted yet
        depth = 0  # current parenthesis nesting level
        for m in self._paren_regexp.finditer(text):
            paren = m.group()
            if depth == 0:
                # Outside any s-expression: everything since the last
                # emitted token is plain whitespace-separated tokens.
                result += text[pos : m.start()].split()
                pos = m.start()
            if paren == self._open_paren:
                depth += 1
            if paren == self._close_paren:
                if self._strict and depth == 0:
                    raise ValueError('Un-matched close paren at char %d' % m.start())
                # In non-strict mode a stray close paren must not drive the
                # depth negative; it is emitted as a token of its own.
                depth = max(0, depth - 1)
                if depth == 0:
                    result.append(text[pos : m.end()])
                    pos = m.end()
        if depth > 0:
            if self._strict:
                raise ValueError('Un-matched open paren at char %d' % pos)
            # Non-strict: keep the unterminated s-expression as one token.
            result.append(text[pos:])
        else:
            # The tail lies outside any parentheses, so it is split on
            # whitespace like every other run of plain tokens (and yields
            # nothing when it is empty or whitespace-only).
            result += text[pos:].split()
        return result
|
|
|
|
|
|
# Function-style shortcut: the bound ``tokenize`` method of a tokenizer built
# with the default arguments (``parens='()'``, ``strict=True``).
sexpr_tokenize = SExprTokenizer().tokenize
|