You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
323 lines
12 KiB
Python
323 lines
12 KiB
Python
# -*- coding: utf-8 -*-
|
|
# Natural Language Toolkit: Transformation-based learning
|
|
#
|
|
# Copyright (C) 2001-2019 NLTK Project
|
|
# Author: Marcus Uneson <marcus.uneson@gmail.com>
|
|
# based on previous (nltk2) version by
|
|
# Christopher Maloof, Edward Loper, Steven Bird
|
|
# URL: <http://nltk.org/>
|
|
# For license information, see LICENSE.TXT
|
|
|
|
from __future__ import print_function
|
|
from abc import ABCMeta, abstractmethod
|
|
from six import add_metaclass
|
|
import itertools as it
|
|
from nltk.tbl.feature import Feature
|
|
from nltk.tbl.rule import Rule
|
|
|
|
|
|
@add_metaclass(ABCMeta)
|
|
class BrillTemplateI(object):
|
|
"""
|
|
An interface for generating lists of transformational rules that
|
|
apply at given sentence positions. ``BrillTemplateI`` is used by
|
|
``Brill`` training algorithms to generate candidate rules.
|
|
"""
|
|
|
|
@abstractmethod
|
|
def applicable_rules(self, tokens, i, correctTag):
|
|
"""
|
|
Return a list of the transformational rules that would correct
|
|
the *i*th subtoken's tag in the given token. In particular,
|
|
return a list of zero or more rules that would change
|
|
*tokens*[i][1] to *correctTag*, if applied to *token*[i].
|
|
|
|
If the *i*th token already has the correct tag (i.e., if
|
|
tagged_tokens[i][1] == correctTag), then
|
|
``applicable_rules()`` should return the empty list.
|
|
|
|
:param tokens: The tagged tokens being tagged.
|
|
:type tokens: list(tuple)
|
|
:param i: The index of the token whose tag should be corrected.
|
|
:type i: int
|
|
:param correctTag: The correct tag for the *i*th token.
|
|
:type correctTag: any
|
|
:rtype: list(BrillRule)
|
|
"""
|
|
|
|
@abstractmethod
|
|
def get_neighborhood(self, token, index):
|
|
"""
|
|
Returns the set of indices *i* such that
|
|
``applicable_rules(token, i, ...)`` depends on the value of
|
|
the *index*th token of *token*.
|
|
|
|
This method is used by the "fast" Brill tagger trainer.
|
|
|
|
:param token: The tokens being tagged.
|
|
:type token: list(tuple)
|
|
:param index: The index whose neighborhood should be returned.
|
|
:type index: int
|
|
:rtype: set
|
|
"""
|
|
|
|
|
|
class Template(BrillTemplateI):
|
|
"""
|
|
A tbl Template that generates a list of L{Rule}s that apply at a given sentence
|
|
position. In particular, each C{Template} is parameterized by a list of
|
|
independent features (a combination of a specific
|
|
property to extract and a list C{L} of relative positions at which to extract
|
|
it) and generates all Rules that:
|
|
|
|
- use the given features, each at its own independent position; and
|
|
- are applicable to the given token.
|
|
"""
|
|
|
|
ALLTEMPLATES = []
|
|
# record a unique id of form "001", for each template created
|
|
# _ids = it.count(0)
|
|
|
|
def __init__(self, *features):
|
|
|
|
"""
|
|
Construct a Template for generating Rules.
|
|
|
|
Takes a list of Features. A C{Feature} is a combination
|
|
of a specific property and its relative positions and should be
|
|
a subclass of L{nltk.tbl.feature.Feature}.
|
|
|
|
An alternative calling convention (kept for backwards compatibility,
|
|
but less expressive as it only permits one feature type) is
|
|
Template(Feature, (start1, end1), (start2, end2), ...)
|
|
In new code, that would be better written
|
|
Template(Feature(start1, end1), Feature(start2, end2), ...)
|
|
|
|
#For instance, importing some features
|
|
>>> from nltk.tbl.template import Template
|
|
>>> from nltk.tag.brill import Word, Pos
|
|
|
|
#create some features
|
|
|
|
>>> wfeat1, wfeat2, pfeat = (Word([-1]), Word([1,2]), Pos([-2,-1]))
|
|
|
|
#Create a single-feature template
|
|
>>> Template(wfeat1)
|
|
Template(Word([-1]))
|
|
|
|
#or a two-feature one
|
|
>>> Template(wfeat1, wfeat2)
|
|
Template(Word([-1]),Word([1, 2]))
|
|
|
|
#or a three-feature one with two different feature types
|
|
>>> Template(wfeat1, wfeat2, pfeat)
|
|
Template(Word([-1]),Word([1, 2]),Pos([-2, -1]))
|
|
|
|
#deprecated api: Feature subclass, followed by list of (start,end) pairs
|
|
#(permits only a single Feature)
|
|
>>> Template(Word, (-2,-1), (0,0))
|
|
Template(Word([-2, -1]),Word([0]))
|
|
|
|
#incorrect specification raises TypeError
|
|
>>> Template(Word, (-2,-1), Pos, (0,0))
|
|
Traceback (most recent call last):
|
|
File "<stdin>", line 1, in <module>
|
|
File "nltk/tag/tbl/template.py", line 143, in __init__
|
|
raise TypeError(
|
|
TypeError: expected either Feature1(args), Feature2(args), ... or Feature, (start1, end1), (start2, end2), ...
|
|
|
|
:type features: list of Features
|
|
:param features: the features to build this Template on
|
|
"""
|
|
# determine the calling form: either
|
|
# Template(Feature, args1, [args2, ...)]
|
|
# Template(Feature1(args), Feature2(args), ...)
|
|
if all(isinstance(f, Feature) for f in features):
|
|
self._features = features
|
|
elif issubclass(features[0], Feature) and all(
|
|
isinstance(a, tuple) for a in features[1:]
|
|
):
|
|
self._features = [features[0](*tp) for tp in features[1:]]
|
|
else:
|
|
raise TypeError(
|
|
"expected either Feature1(args), Feature2(args), ... or Feature, (start1, end1), (start2, end2), ..."
|
|
)
|
|
self.id = "{0:03d}".format(len(self.ALLTEMPLATES))
|
|
self.ALLTEMPLATES.append(self)
|
|
|
|
def __repr__(self):
|
|
return "%s(%s)" % (
|
|
self.__class__.__name__,
|
|
",".join([str(f) for f in self._features]),
|
|
)
|
|
|
|
def applicable_rules(self, tokens, index, correct_tag):
|
|
if tokens[index][1] == correct_tag:
|
|
return []
|
|
|
|
# For each of this Template's features, find the conditions
|
|
# that are applicable for the given token.
|
|
# Then, generate one Rule for each combination of features
|
|
# (the crossproduct of the conditions).
|
|
|
|
applicable_conditions = self._applicable_conditions(tokens, index)
|
|
xs = list(it.product(*applicable_conditions))
|
|
return [Rule(self.id, tokens[index][1], correct_tag, tuple(x)) for x in xs]
|
|
|
|
def _applicable_conditions(self, tokens, index):
|
|
"""
|
|
:returns: A set of all conditions for rules
|
|
that are applicable to C{tokens[index]}.
|
|
"""
|
|
conditions = []
|
|
|
|
for feature in self._features:
|
|
conditions.append([])
|
|
for pos in feature.positions:
|
|
if not (0 <= index + pos < len(tokens)):
|
|
continue
|
|
value = feature.extract_property(tokens, index + pos)
|
|
conditions[-1].append((feature, value))
|
|
return conditions
|
|
|
|
def get_neighborhood(self, tokens, index):
|
|
# inherit docs from BrillTemplateI
|
|
|
|
# applicable_rules(tokens, index, ...) depends on index.
|
|
neighborhood = set([index]) # set literal for python 2.7+
|
|
|
|
# applicable_rules(tokens, i, ...) depends on index if
|
|
# i+start < index <= i+end.
|
|
|
|
allpositions = [0] + [p for feat in self._features for p in feat.positions]
|
|
start, end = min(allpositions), max(allpositions)
|
|
s = max(0, index + (-end))
|
|
e = min(index + (-start) + 1, len(tokens))
|
|
for i in range(s, e):
|
|
neighborhood.add(i)
|
|
return neighborhood
|
|
|
|
@classmethod
|
|
def expand(cls, featurelists, combinations=None, skipintersecting=True):
|
|
|
|
"""
|
|
Factory method to mass generate Templates from a list L of lists of Features.
|
|
|
|
#With combinations=(k1, k2), the function will in all possible ways choose k1 ... k2
|
|
#of the sublists in L; it will output all Templates formed by the Cartesian product
|
|
#of this selection, with duplicates and other semantically equivalent
|
|
#forms removed. Default for combinations is (1, len(L)).
|
|
|
|
The feature lists may have been specified
|
|
manually, or generated from Feature.expand(). For instance,
|
|
|
|
>>> from nltk.tbl.template import Template
|
|
>>> from nltk.tag.brill import Word, Pos
|
|
|
|
#creating some features
|
|
>>> (wd_0, wd_01) = (Word([0]), Word([0,1]))
|
|
|
|
>>> (pos_m2, pos_m33) = (Pos([-2]), Pos([3-2,-1,0,1,2,3]))
|
|
|
|
>>> list(Template.expand([[wd_0], [pos_m2]]))
|
|
[Template(Word([0])), Template(Pos([-2])), Template(Pos([-2]),Word([0]))]
|
|
|
|
>>> list(Template.expand([[wd_0, wd_01], [pos_m2]]))
|
|
[Template(Word([0])), Template(Word([0, 1])), Template(Pos([-2])), Template(Pos([-2]),Word([0])), Template(Pos([-2]),Word([0, 1]))]
|
|
|
|
#note: with Feature.expand(), it is very easy to generate more templates
|
|
#than your system can handle -- for instance,
|
|
>>> wordtpls = Word.expand([-2,-1,0,1], [1,2], excludezero=False)
|
|
>>> len(wordtpls)
|
|
7
|
|
|
|
>>> postpls = Pos.expand([-3,-2,-1,0,1,2], [1,2,3], excludezero=True)
|
|
>>> len(postpls)
|
|
9
|
|
|
|
#and now the Cartesian product of all non-empty combinations of two wordtpls and
|
|
#two postpls, with semantic equivalents removed
|
|
>>> templates = list(Template.expand([wordtpls, wordtpls, postpls, postpls]))
|
|
>>> len(templates)
|
|
713
|
|
|
|
|
|
will return a list of eight templates
|
|
Template(Word([0])),
|
|
Template(Word([0, 1])),
|
|
Template(Pos([-2])),
|
|
Template(Pos([-1])),
|
|
Template(Pos([-2]),Word([0])),
|
|
Template(Pos([-1]),Word([0])),
|
|
Template(Pos([-2]),Word([0, 1])),
|
|
Template(Pos([-1]),Word([0, 1]))]
|
|
|
|
|
|
#Templates where one feature is a subset of another, such as
|
|
#Template(Word([0,1]), Word([1]), will not appear in the output.
|
|
#By default, this non-subset constraint is tightened to disjointness:
|
|
#Templates of type Template(Word([0,1]), Word([1,2]) will also be filtered out.
|
|
#With skipintersecting=False, then such Templates are allowed
|
|
|
|
WARNING: this method makes it very easy to fill all your memory when training
|
|
generated templates on any real-world corpus
|
|
|
|
:param featurelists: lists of Features, whose Cartesian product will return a set of Templates
|
|
:type featurelists: list of (list of Features)
|
|
:param combinations: given n featurelists: if combinations=k, all generated Templates will have
|
|
k features; if combinations=(k1,k2) they will have k1..k2 features; if None, defaults to 1..n
|
|
:type combinations: None, int, or (int, int)
|
|
:param skipintersecting: if True, do not output intersecting Templates (non-disjoint positions for some feature)
|
|
:type skipintersecting: bool
|
|
:returns: generator of Templates
|
|
|
|
"""
|
|
|
|
def nonempty_powerset(xs): # xs is a list
|
|
# itertools docnonempty_powerset([1,2,3]) --> (1,) (2,) (3,) (1,2) (1,3) (2,3) (1,2,3)
|
|
|
|
# find the correct tuple given combinations, one of {None, k, (k1,k2)}
|
|
k = combinations # for brevity
|
|
combrange = (
|
|
(1, len(xs) + 1)
|
|
if k is None
|
|
else (k, k + 1) # n over 1 .. n over n (all non-empty combinations)
|
|
if isinstance(k, int)
|
|
else (k[0], k[1] + 1) # n over k (only
|
|
) # n over k1, n over k1+1... n over k2
|
|
return it.chain.from_iterable(
|
|
it.combinations(xs, r) for r in range(*combrange)
|
|
)
|
|
|
|
seentemplates = set()
|
|
for picks in nonempty_powerset(featurelists):
|
|
for pick in it.product(*picks):
|
|
if any(
|
|
i != j and x.issuperset(y)
|
|
for (i, x) in enumerate(pick)
|
|
for (j, y) in enumerate(pick)
|
|
):
|
|
continue
|
|
if skipintersecting and any(
|
|
i != j and x.intersects(y)
|
|
for (i, x) in enumerate(pick)
|
|
for (j, y) in enumerate(pick)
|
|
):
|
|
continue
|
|
thistemplate = cls(*sorted(pick))
|
|
strpick = str(thistemplate)
|
|
#!!FIXME --this is hackish
|
|
if strpick in seentemplates: # already added
|
|
cls._poptemplate()
|
|
continue
|
|
seentemplates.add(strpick)
|
|
yield thistemplate
|
|
|
|
@classmethod
|
|
def _cleartemplates(cls):
|
|
cls.ALLTEMPLATES = []
|
|
|
|
@classmethod
|
|
def _poptemplate(cls):
|
|
return cls.ALLTEMPLATES.pop() if cls.ALLTEMPLATES else None
|