You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
308 lines
9.8 KiB
Plaintext
308 lines
9.8 KiB
Plaintext
.. Copyright (C) 2001-2020 NLTK Project
|
|
.. For license information, see LICENSE.TXT
|
|
|
|
===============================
|
|
Unit test cases for ``toolbox``
|
|
===============================
|
|
|
|
>>> from nltk import toolbox
|
|
|
|
--------------------------
|
|
``toolbox.StandardFormat``
|
|
--------------------------
|
|
|
|
>>> f = toolbox.StandardFormat()
|
|
|
|
``toolbox.StandardFormat.open()``
|
|
---------------------------------
|
|
>>> import os, tempfile
|
|
>>> (fd, fname) = tempfile.mkstemp()
|
|
>>> tf = os.fdopen(fd, "w")
|
|
>>> _ = tf.write('\\lx a value\n\\lx another value\n')
|
|
>>> tf.close()
|
|
>>> f = toolbox.StandardFormat()
|
|
>>> f.open(fname)
|
|
>>> list(f.fields())
|
|
[('lx', 'a value'), ('lx', 'another value')]
|
|
>>> f.close()
|
|
>>> os.unlink(fname)
|
|
|
|
``toolbox.StandardFormat.open_string()``
|
|
----------------------------------------
|
|
>>> f = toolbox.StandardFormat()
|
|
>>> f.open_string('\\lx a value\n\\lx another value\n')
|
|
>>> list(f.fields())
|
|
[('lx', 'a value'), ('lx', 'another value')]
|
|
>>> f.close()
|
|
|
|
``toolbox.StandardFormat.close()``
|
|
----------------------------------
|
|
>>> f = toolbox.StandardFormat()
|
|
>>> f.open_string('\\lx a value\n\\lx another value\n')
|
|
>>> list(f.fields())
|
|
[('lx', 'a value'), ('lx', 'another value')]
|
|
>>> f.close()
|
|
|
|
``toolbox.StandardFormat.line_num``
|
|
---------------------------------------
|
|
|
|
``StandardFormat.line_num`` contains the line number of the last line returned:
|
|
|
|
>>> f = toolbox.StandardFormat()
|
|
>>> f.open_string('\\lx a value\n\\lx another value\n\\lx a third value\n')
|
|
>>> line_nums = []
|
|
>>> for l in f.raw_fields():
|
|
... line_nums.append(f.line_num)
|
|
>>> line_nums
|
|
[1, 2, 3]
|
|
|
|
For fields that span several lines, ``StandardFormat.line_num`` contains the line number of the last line of the field just returned:
|
|
|
|
>>> f = toolbox.StandardFormat()
|
|
>>> f.open_string('\\lx two\nlines\n\\lx three\nlines\n\n\\lx two\nlines\n')
|
|
>>> line_nums = []
|
|
>>> for l in f.raw_fields():
|
|
... line_nums.append(f.line_num)
|
|
>>> line_nums
|
|
[2, 5, 7]
|
|
|
|
``StandardFormat.line_num`` doesn't exist before opening or after closing
|
|
a file or string:
|
|
|
|
>>> f = toolbox.StandardFormat()
|
|
>>> f.line_num
|
|
Traceback (most recent call last):
|
|
...
|
|
AttributeError: 'StandardFormat' object has no attribute 'line_num'
|
|
>>> f.open_string('\\lx two\nlines\n\\lx three\nlines\n\n\\lx two\nlines\n')
|
|
>>> line_nums = []
|
|
>>> for l in f.raw_fields():
|
|
... line_nums.append(f.line_num)
|
|
>>> line_nums
|
|
[2, 5, 7]
|
|
>>> f.close()
|
|
>>> f.line_num
|
|
Traceback (most recent call last):
|
|
...
|
|
AttributeError: 'StandardFormat' object has no attribute 'line_num'
|
|
|
|
``toolbox.StandardFormat.raw_fields()``
|
|
---------------------------------------
|
|
``raw_fields()`` returns an iterator over tuples of two strings representing the
|
|
marker and its value. The marker is given without the backslash and the value
|
|
without its trailing newline:
|
|
|
|
>>> f = toolbox.StandardFormat()
|
|
>>> f.open_string('\\lx a value\n\\lx another value\n')
|
|
>>> list(f.raw_fields())
|
|
[('lx', 'a value'), ('lx', 'another value')]
|
|
|
|
an empty file returns nothing:
|
|
|
|
>>> f = toolbox.StandardFormat()
|
|
>>> f.open_string('')
|
|
>>> list(f.raw_fields())
|
|
[]
|
|
|
|
file with only a newline currently returns a single field with no marker and an empty value (TODO: confirm that this is the intended behaviour):
|
|
|
|
>>> f = toolbox.StandardFormat()
|
|
>>> f.open_string('\n')
|
|
>>> list(f.raw_fields())
|
|
[(None, '')]
|
|
|
|
file with only one field should be parsed ok:
|
|
|
|
>>> f = toolbox.StandardFormat()
|
|
>>> f.open_string('\\lx one value\n')
|
|
>>> list(f.raw_fields())
|
|
[('lx', 'one value')]
|
|
|
|
file without a trailing newline should be parsed ok:
|
|
|
|
>>> f = toolbox.StandardFormat()
|
|
>>> f.open_string('\\lx a value\n\\lx another value')
|
|
>>> list(f.raw_fields())
|
|
[('lx', 'a value'), ('lx', 'another value')]
|
|
|
|
trailing white space is preserved except for the final newline:
|
|
|
|
>>> f = toolbox.StandardFormat()
|
|
>>> f.open_string('\\lx trailing space \n\\lx trailing tab\t\n\\lx extra newline\n\n')
|
|
>>> list(f.raw_fields())
|
|
[('lx', 'trailing space '), ('lx', 'trailing tab\t'), ('lx', 'extra newline\n')]
|
|
|
|
line wrapping is preserved:
|
|
|
|
>>> f = toolbox.StandardFormat()
|
|
>>> f.open_string('\\lx a value\nmore of the value\nand still more\n\\lc another val\n')
|
|
>>> list(f.raw_fields())
|
|
[('lx', 'a value\nmore of the value\nand still more'), ('lc', 'another val')]
|
|
|
|
file beginning with a multiline record should be parsed ok:
|
|
|
|
>>> f = toolbox.StandardFormat()
|
|
>>> f.open_string('\\lx a value\nmore of the value\nand still more\n\\lc another val\n')
|
|
>>> list(f.raw_fields())
|
|
[('lx', 'a value\nmore of the value\nand still more'), ('lc', 'another val')]
|
|
|
|
file ending with a multiline record should be parsed ok:
|
|
|
|
>>> f = toolbox.StandardFormat()
|
|
>>> f.open_string('\\lc a value\n\\lx another value\nmore of the value\nand still more\n')
|
|
>>> list(f.raw_fields())
|
|
[('lc', 'a value'), ('lx', 'another value\nmore of the value\nand still more')]
|
|
|
|
file beginning with a BOM should be parsed ok:
|
|
|
|
>>> f = toolbox.StandardFormat()
|
|
>>> f.open_string('\xef\xbb\xbf\\lx a value\n\\lx another value\n')
|
|
>>> list(f.raw_fields())
|
|
[('lx', 'a value'), ('lx', 'another value')]
|
|
|
|
file beginning with two BOMs should ignore only the first one:
|
|
|
|
>>> f = toolbox.StandardFormat()
|
|
>>> f.open_string('\xef\xbb\xbf\xef\xbb\xbf\\lx a value\n\\lx another value\n')
|
|
>>> list(f.raw_fields())
|
|
[(None, '\xef\xbb\xbf\\lx a value'), ('lx', 'another value')]
|
|
|
|
should not ignore a BOM not at the beginning of the file:
|
|
|
|
>>> f = toolbox.StandardFormat()
|
|
>>> f.open_string('\\lx a value\n\xef\xbb\xbf\\lx another value\n')
|
|
>>> list(f.raw_fields())
|
|
[('lx', 'a value\n\xef\xbb\xbf\\lx another value')]
|
|
|
|
``toolbox.StandardFormat.fields()``
|
|
-----------------------------------
|
|
trailing white space is not preserved:
|
|
|
|
>>> f = toolbox.StandardFormat()
|
|
>>> f.open_string('\\lx trailing space \n\\lx trailing tab\t\n\\lx extra newline\n\n')
|
|
>>> list(f.fields())
|
|
[('lx', 'trailing space'), ('lx', 'trailing tab'), ('lx', 'extra newline')]
|
|
|
|
multiline fields are unwrapped:
|
|
|
|
>>> f = toolbox.StandardFormat()
|
|
>>> f.open_string('\\lx a value\nmore of the value\nand still more\n\\lc another val\n')
|
|
>>> list(f.fields())
|
|
[('lx', 'a value more of the value and still more'), ('lc', 'another val')]
|
|
|
|
markers
|
|
-------
|
|
A backslash in the first position on a new line indicates the start of a
|
|
marker. The backslash is not part of the marker:
|
|
|
|
>>> f = toolbox.StandardFormat()
|
|
>>> f.open_string('\\mk a value\n')
|
|
>>> list(f.fields())
|
|
[('mk', 'a value')]
|
|
|
|
If the backslash occurs later in the line it does not indicate the start
|
|
of a marker:
|
|
|
|
>>> f = toolbox.StandardFormat()
|
|
>>> f.open_string('\\mk a value\n \\mk another one\n')
|
|
>>> list(f.raw_fields())
|
|
[('mk', 'a value\n \\mk another one')]
|
|
|
|
There is no specific limit to the length of a marker:
|
|
|
|
>>> f = toolbox.StandardFormat()
|
|
>>> f.open_string('\\this_is_an_extremely_long_marker value\n')
|
|
>>> list(f.fields())
|
|
[('this_is_an_extremely_long_marker', 'value')]
|
|
|
|
A marker can contain any non white space character:
|
|
|
|
>>> f = toolbox.StandardFormat()
|
|
>>> f.open_string('\\`~!@#$%^&*()_-=+[{]}\|,<.>/?;:"0123456789 value\n')
|
|
>>> list(f.fields())
|
|
[('`~!@#$%^&*()_-=+[{]}\\|,<.>/?;:"0123456789', 'value')]
|
|
|
|
A marker is terminated by any white space character:
|
|
|
|
>>> f = toolbox.StandardFormat()
|
|
>>> f.open_string('\\mk a value\n\\mk\tanother one\n\\mk\rthird one\n\\mk\ffourth one')
|
|
>>> list(f.fields())
|
|
[('mk', 'a value'), ('mk', 'another one'), ('mk', 'third one'), ('mk', 'fourth one')]
|
|
|
|
Consecutive whitespace characters (except newline) are treated the same as one:
|
|
|
|
>>> f = toolbox.StandardFormat()
|
|
>>> f.open_string('\\mk \t\r\fa value\n')
|
|
>>> list(f.fields())
|
|
[('mk', 'a value')]
|
|
|
|
-----------------------
|
|
``toolbox.ToolboxData``
|
|
-----------------------
|
|
|
|
>>> db = toolbox.ToolboxData()
|
|
|
|
``toolbox.ToolboxData.parse()``
|
|
-------------------------------
|
|
check that normal parsing works:
|
|
|
|
>>> from xml.etree import ElementTree
|
|
>>> td = toolbox.ToolboxData()
|
|
>>> s = """\\_sh v3.0 400 Rotokas Dictionary
|
|
... \\_DateStampHasFourDigitYear
|
|
...
|
|
... \\lx kaa
|
|
... \\ps V.A
|
|
... \\ge gag
|
|
... \\gp nek i pas
|
|
...
|
|
... \\lx kaa
|
|
... \\ps V.B
|
|
... \\ge strangle
|
|
... \\gp pasim nek
|
|
... """
|
|
>>> td.open_string(s)
|
|
>>> tree = td.parse(key='lx')
|
|
>>> tree.tag
|
|
'toolbox_data'
|
|
>>> ElementTree.tostring(list(tree)[0]).decode('utf8')
|
|
'<header><_sh>v3.0 400 Rotokas Dictionary</_sh><_DateStampHasFourDigitYear /></header>'
|
|
>>> ElementTree.tostring(list(tree)[1]).decode('utf8')
|
|
'<record><lx>kaa</lx><ps>V.A</ps><ge>gag</ge><gp>nek i pas</gp></record>'
|
|
>>> ElementTree.tostring(list(tree)[2]).decode('utf8')
|
|
'<record><lx>kaa</lx><ps>V.B</ps><ge>strangle</ge><gp>pasim nek</gp></record>'
|
|
|
|
check that guessing the key marker works:
|
|
|
|
>>> from xml.etree import ElementTree
|
|
>>> td = toolbox.ToolboxData()
|
|
>>> s = """\\_sh v3.0 400 Rotokas Dictionary
|
|
... \\_DateStampHasFourDigitYear
|
|
...
|
|
... \\lx kaa
|
|
... \\ps V.A
|
|
... \\ge gag
|
|
... \\gp nek i pas
|
|
...
|
|
... \\lx kaa
|
|
... \\ps V.B
|
|
... \\ge strangle
|
|
... \\gp pasim nek
|
|
... """
|
|
>>> td.open_string(s)
|
|
>>> tree = td.parse()
|
|
>>> ElementTree.tostring(list(tree)[0]).decode('utf8')
|
|
'<header><_sh>v3.0 400 Rotokas Dictionary</_sh><_DateStampHasFourDigitYear /></header>'
|
|
>>> ElementTree.tostring(list(tree)[1]).decode('utf8')
|
|
'<record><lx>kaa</lx><ps>V.A</ps><ge>gag</ge><gp>nek i pas</gp></record>'
|
|
>>> ElementTree.tostring(list(tree)[2]).decode('utf8')
|
|
'<record><lx>kaa</lx><ps>V.B</ps><ge>strangle</ge><gp>pasim nek</gp></record>'
|
|
|
|
-----------------------
|
|
``toolbox`` functions
|
|
-----------------------
|
|
|
|
``toolbox.to_sfm_string()``
|
|
-------------------------------
|
|
|