# -*- coding: utf-8 -*-
# Natural Language Toolkit: Transformation-based learning
#
# Copyright (C) 2001-2020 NLTK Project
# Author: Marcus Uneson <marcus.uneson@gmail.com>
# based on previous (nltk2) version by
# Christopher Maloof, Edward Loper, Steven Bird
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
from collections import defaultdict, Counter
from nltk.tag import TaggerI
from nltk.tbl import Feature, Template
from nltk import jsontags
######################################################################
# Brill Templates
######################################################################
@jsontags.register_tag
class Word(Feature):
"""
Feature which examines the text (word) of nearby tokens.
"""
json_tag = "nltk.tag.brill.Word"
@staticmethod
def extract_property(tokens, index):
"""@return: The given token's text."""
return tokens[index][0]
@jsontags.register_tag
class Pos(Feature):
"""
Feature which examines the tags of nearby tokens.
"""
json_tag = "nltk.tag.brill.Pos"
@staticmethod
def extract_property(tokens, index):
"""@return: The given token's tag."""
return tokens[index][1]
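# Illustrative sketch (not part of the original module): for a tagged sentence,
# the two features above simply pick out the word or the tag at the given index.
#
#     >>> sent = [("The", "DT"), ("cat", "NN"), ("sat", "VBD")]
#     >>> Word.extract_property(sent, 1)
#     'cat'
#     >>> Pos.extract_property(sent, 1)
#     'NN'
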
def nltkdemo18():
"""
Return 18 templates, from the original nltk demo, in multi-feature syntax
"""
return [
Template(Pos([-1])),
Template(Pos([1])),
Template(Pos([-2])),
Template(Pos([2])),
Template(Pos([-2, -1])),
Template(Pos([1, 2])),
Template(Pos([-3, -2, -1])),
Template(Pos([1, 2, 3])),
Template(Pos([-1]), Pos([1])),
Template(Word([-1])),
Template(Word([1])),
Template(Word([-2])),
Template(Word([2])),
Template(Word([-2, -1])),
Template(Word([1, 2])),
Template(Word([-3, -2, -1])),
Template(Word([1, 2, 3])),
Template(Word([-1]), Word([1])),
]
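# In the multi-feature syntax used above, each Feature lists the relative
# positions it may look at, and a Template combines one or more Features.
# A rule generated from a template fires when, for every Feature, the feature
# value at (at least) one of its positions matches. Illustrative reading only,
# not part of the original module:
#
#     Template(Pos([-1]))             # the preceding token's tag is X
#     Template(Pos([-2, -1]))         # one of the two preceding tags is X
#     Template(Pos([-1]), Word([1]))  # preceding tag is X AND following word is Y
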
def nltkdemo18plus():
"""
    Return the 18 templates of nltkdemo18() plus a few additional
    multi-feature ones (the motivation is easy comparison with nltkdemo18)
"""
return nltkdemo18() + [
Template(Word([-1]), Pos([1])),
Template(Pos([-1]), Word([1])),
Template(Word([-1]), Word([0]), Pos([1])),
Template(Pos([-1]), Word([0]), Word([1])),
Template(Pos([-1]), Word([0]), Pos([1])),
]
def fntbl37():
"""
    Return 37 templates taken from the part-of-speech tagging task of the
fntbl distribution http://www.cs.jhu.edu/~rflorian/fntbl/
(37 is after excluding a handful which do not condition on Pos[0];
fntbl can do that but the current nltk implementation cannot.)
"""
return [
Template(Word([0]), Word([1]), Word([2])),
Template(Word([-1]), Word([0]), Word([1])),
Template(Word([0]), Word([-1])),
Template(Word([0]), Word([1])),
Template(Word([0]), Word([2])),
Template(Word([0]), Word([-2])),
Template(Word([1, 2])),
Template(Word([-2, -1])),
Template(Word([1, 2, 3])),
Template(Word([-3, -2, -1])),
Template(Word([0]), Pos([2])),
Template(Word([0]), Pos([-2])),
Template(Word([0]), Pos([1])),
Template(Word([0]), Pos([-1])),
Template(Word([0])),
Template(Word([-2])),
Template(Word([2])),
Template(Word([1])),
Template(Word([-1])),
Template(Pos([-1]), Pos([1])),
Template(Pos([1]), Pos([2])),
Template(Pos([-1]), Pos([-2])),
Template(Pos([1])),
Template(Pos([-1])),
Template(Pos([-2])),
Template(Pos([2])),
Template(Pos([1, 2, 3])),
Template(Pos([1, 2])),
Template(Pos([-3, -2, -1])),
Template(Pos([-2, -1])),
Template(Pos([1]), Word([0]), Word([1])),
Template(Pos([1]), Word([0]), Word([-1])),
Template(Pos([-1]), Word([-1]), Word([0])),
Template(Pos([-1]), Word([0]), Word([1])),
Template(Pos([-2]), Pos([-1])),
Template(Pos([1]), Pos([2])),
Template(Pos([1]), Pos([2]), Word([1])),
]
def brill24():
"""
Return 24 templates of the seminal TBL paper, Brill (1995)
"""
return [
Template(Pos([-1])),
Template(Pos([1])),
Template(Pos([-2])),
Template(Pos([2])),
Template(Pos([-2, -1])),
Template(Pos([1, 2])),
Template(Pos([-3, -2, -1])),
Template(Pos([1, 2, 3])),
Template(Pos([-1]), Pos([1])),
Template(Pos([-2]), Pos([-1])),
Template(Pos([1]), Pos([2])),
Template(Word([-1])),
Template(Word([1])),
Template(Word([-2])),
Template(Word([2])),
Template(Word([-2, -1])),
Template(Word([1, 2])),
Template(Word([-1, 0])),
Template(Word([0, 1])),
Template(Word([0])),
Template(Word([-1]), Pos([-1])),
Template(Word([1]), Pos([1])),
Template(Word([0]), Word([-1]), Pos([-1])),
Template(Word([0]), Word([1]), Pos([1])),
]
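# Rough usage sketch for the template sets above (illustrative only; assumes
# the treebank corpus is installed, and names like train_sents are examples,
# not part of this module):
#
#     >>> from nltk.corpus import treebank
#     >>> from nltk.tag import UnigramTagger
#     >>> from nltk.tag.brill_trainer import BrillTaggerTrainer
#     >>> train_sents = treebank.tagged_sents()[:3000]
#     >>> baseline = UnigramTagger(train_sents)
#     >>> trainer = BrillTaggerTrainer(baseline, brill24(), trace=1)
#     >>> tagger = trainer.train(train_sents, max_rules=100)
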
def describe_template_sets():
"""
    Print the available template sets in this demo, with a short description.
"""
import inspect
import sys
# a bit of magic to get all functions in this module
templatesets = inspect.getmembers(sys.modules[__name__], inspect.isfunction)
for (name, obj) in templatesets:
if name == "describe_template_sets":
continue
print(name, obj.__doc__, "\n")
######################################################################
# The Brill Tagger
######################################################################
@jsontags.register_tag
class BrillTagger(TaggerI):
"""
Brill's transformational rule-based tagger. Brill taggers use an
initial tagger (such as ``tag.DefaultTagger``) to assign an initial
    tag sequence to a text, and then apply an ordered list of
transformational rules to correct the tags of individual tokens.
These transformation rules are specified by the ``TagRule``
interface.
Brill taggers can be created directly, from an initial tagger and
a list of transformational rules; but more often, Brill taggers
are created by learning rules from a training corpus, using one
of the TaggerTrainers available.
"""
json_tag = "nltk.tag.BrillTagger"
def __init__(self, initial_tagger, rules, training_stats=None):
"""
:param initial_tagger: The initial tagger
:type initial_tagger: TaggerI
:param rules: An ordered list of transformation rules that
should be used to correct the initial tagging.
:type rules: list(TagRule)
:param training_stats: A dictionary of statistics collected
during training, for possible later use
:type training_stats: dict
"""
self._initial_tagger = initial_tagger
self._rules = tuple(rules)
self._training_stats = training_stats
def encode_json_obj(self):
return self._initial_tagger, self._rules, self._training_stats
@classmethod
def decode_json_obj(cls, obj):
_initial_tagger, _rules, _training_stats = obj
return cls(_initial_tagger, _rules, _training_stats)
def rules(self):
"""
Return the ordered list of transformation rules that this tagger has learnt
:return: the ordered list of transformation rules that correct the initial tagging
        :rtype: tuple of Rule
"""
return self._rules
def train_stats(self, statistic=None):
"""
Return a named statistic collected during training, or a dictionary of all
available statistics if no name given
:param statistic: name of statistic
:type statistic: str
:return: some statistic collected during training of this tagger
:rtype: any (but usually a number)
"""
if statistic is None:
return self._training_stats
else:
return self._training_stats.get(statistic)
def tag(self, tokens):
# Inherit documentation from TaggerI
# Run the initial tagger.
tagged_tokens = self._initial_tagger.tag(tokens)
# Create a dictionary that maps each tag to a list of the
# indices of tokens that have that tag.
tag_to_positions = defaultdict(set)
for i, (token, tag) in enumerate(tagged_tokens):
tag_to_positions[tag].add(i)
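        # For example, if the initial tags are DT NN VBD DT NN, the index is
        # {"DT": {0, 3}, "NN": {1, 4}, "VBD": {2}} (illustrative values only).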
# Apply each rule, in order. Only try to apply rules at
# positions that have the desired original tag.
for rule in self._rules:
# Find the positions where it might apply
positions = tag_to_positions.get(rule.original_tag, [])
# Apply the rule at those positions.
changed = rule.apply(tagged_tokens, positions)
# Update tag_to_positions with the positions of tags that
# were modified.
for i in changed:
tag_to_positions[rule.original_tag].remove(i)
tag_to_positions[rule.replacement_tag].add(i)
return tagged_tokens
def print_template_statistics(self, test_stats=None, printunused=True):
"""
Print a list of all templates, ranked according to efficiency.
If test_stats is available, the templates are ranked according to their
relative contribution (summed for all rules created from a given template,
        weighted by score) to the performance on the test set. If no test_stats is given,
        statistics collected during training are used instead. There is also
        an unweighted measure (just counting the rules). This is less informative,
        though, as many low-scoring rules will appear towards the end of training.
:param test_stats: dictionary of statistics collected during testing
:type test_stats: dict of str -> any (but usually numbers)
:param printunused: if True, print a list of all unused templates
:type printunused: bool
:return: None
:rtype: None
"""
tids = [r.templateid for r in self._rules]
train_stats = self.train_stats()
trainscores = train_stats["rulescores"]
        assert len(trainscores) == len(tids), (
            "corrupt statistics: "
            "{0} train scores for {1} rules".format(len(trainscores), len(tids))
        )
template_counts = Counter(tids)
weighted_traincounts = Counter()
for (tid, score) in zip(tids, trainscores):
weighted_traincounts[tid] += score
tottrainscores = sum(trainscores)
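        # Illustrative arithmetic: if some template produced three rules with train
        # scores [10, 4, 1], its weighted_traincounts entry is 15, and the weighted
        # share reported below is 15 / tottrainscores.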
# det_tplsort() is for deterministic sorting;
# the otherwise convenient Counter.most_common() unfortunately
# does not break ties deterministically
# between python versions and will break cross-version tests
def det_tplsort(tpl_value):
return (tpl_value[1], repr(tpl_value[0]))
def print_train_stats():
print(
"TEMPLATE STATISTICS (TRAIN) {0} templates, {1} rules)".format(
len(template_counts), len(tids)
)
)
print(
"TRAIN ({tokencount:7d} tokens) initial {initialerrors:5d} {initialacc:.4f} "
"final: {finalerrors:5d} {finalacc:.4f} ".format(**train_stats)
)
head = "#ID | Score (train) | #Rules | Template"
print(head, "\n", "-" * len(head), sep="")
train_tplscores = sorted(
weighted_traincounts.items(), key=det_tplsort, reverse=True
)
for (tid, trainscore) in train_tplscores:
s = "{0} | {1:5d} {2:5.3f} |{3:4d} {4:.3f} | {5}".format(
tid,
trainscore,
trainscore / tottrainscores,
template_counts[tid],
template_counts[tid] / len(tids),
Template.ALLTEMPLATES[int(tid)],
)
print(s)
def print_testtrain_stats():
testscores = test_stats["rulescores"]
print(
"TEMPLATE STATISTICS (TEST AND TRAIN) ({0} templates, {1} rules)".format(
len(template_counts), len(tids)
)
)
print(
"TEST ({tokencount:7d} tokens) initial {initialerrors:5d} {initialacc:.4f} "
"final: {finalerrors:5d} {finalacc:.4f} ".format(**test_stats)
)
print(
"TRAIN ({tokencount:7d} tokens) initial {initialerrors:5d} {initialacc:.4f} "
"final: {finalerrors:5d} {finalacc:.4f} ".format(**train_stats)
)
weighted_testcounts = Counter()
for (tid, score) in zip(tids, testscores):
weighted_testcounts[tid] += score
tottestscores = sum(testscores)
head = "#ID | Score (test) | Score (train) | #Rules | Template"
print(head, "\n", "-" * len(head), sep="")
test_tplscores = sorted(
weighted_testcounts.items(), key=det_tplsort, reverse=True
)
for (tid, testscore) in test_tplscores:
s = "{0:s} |{1:5d} {2:6.3f} | {3:4d} {4:.3f} |{5:4d} {6:.3f} | {7:s}".format(
tid,
testscore,
testscore / tottestscores,
weighted_traincounts[tid],
weighted_traincounts[tid] / tottrainscores,
template_counts[tid],
template_counts[tid] / len(tids),
Template.ALLTEMPLATES[int(tid)],
)
print(s)
def print_unused_templates():
usedtpls = set(int(tid) for tid in tids)
unused = [
(tid, tpl)
for (tid, tpl) in enumerate(Template.ALLTEMPLATES)
if tid not in usedtpls
]
print("UNUSED TEMPLATES ({0})".format(len(unused)))
for (tid, tpl) in unused:
print("{0:03d} {1:s}".format(tid, str(tpl)))
if test_stats is None:
print_train_stats()
else:
print_testtrain_stats()
print()
if printunused:
print_unused_templates()
print()
def batch_tag_incremental(self, sequences, gold):
"""
Tags by applying each rule to the entire corpus (rather than all rules to a
single sequence). The point is to collect statistics on the test set for
individual rules.
NOTE: This is inefficient (does not build any index, so will traverse the entire
corpus N times for N rules) -- usually you would not care about statistics for
        individual rules and thus use tag_sents() instead
:param sequences: lists of token sequences (sentences, in some applications) to be tagged
:type sequences: list of list of strings
        :param gold: the gold standard (tagged sequences)
        :type gold: list of list of (str, str) tuples
        :returns: tuple of (tagged_sequences, dict of testing statistics, with the
            ordered list of per-rule scores under the "rulescores" key)
"""
def counterrors(xs):
return sum(t[1] != g[1] for pair in zip(xs, gold) for (t, g) in zip(*pair))
testing_stats = {}
testing_stats["tokencount"] = sum(len(t) for t in sequences)
testing_stats["sequencecount"] = len(sequences)
tagged_tokenses = [self._initial_tagger.tag(tokens) for tokens in sequences]
testing_stats["initialerrors"] = counterrors(tagged_tokenses)
testing_stats["initialacc"] = (
1 - testing_stats["initialerrors"] / testing_stats["tokencount"]
)
# Apply each rule to the entire corpus, in order
errors = [testing_stats["initialerrors"]]
for rule in self._rules:
for tagged_tokens in tagged_tokenses:
rule.apply(tagged_tokens)
errors.append(counterrors(tagged_tokenses))
testing_stats["rulescores"] = [
err0 - err1 for (err0, err1) in zip(errors, errors[1:])
]
testing_stats["finalerrors"] = errors[-1]
testing_stats["finalacc"] = (
1 - testing_stats["finalerrors"] / testing_stats["tokencount"]
)
return (tagged_tokenses, testing_stats)
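
# Rough sketch of collecting per-rule statistics on a test set (illustrative
# only; tagger and test_sents are example names, not defined in this module):
#
#     >>> untagged = [[w for (w, t) in sent] for sent in test_sents]
#     >>> tagged, test_stats = tagger.batch_tag_incremental(untagged, test_sents)
#     >>> tagger.print_template_statistics(test_stats)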