# You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
# 159 lines
# 5.9 KiB
# Python

from __future__ import print_function
from __future__ import unicode_literals
from __future__ import division
from builtins import str, bytes, dict, int
from builtins import map, zip
from builtins import range
import os
import sys
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", ".."))
import random
from collections import defaultdict
from pattern.text import Model
from pattern.vector import shuffled, SLP
from pattern.en import lexicon, parsetree
from random import seed
from io import open
# This example demonstrates how a Perceptron classifier
# can be used to construct an English language model
# (i.e., a classifier that predicts part-of-speech tags),
# by learning from a training set of tagged sentences.
# First we need training data: a corpus of manually annotated (= tagged) sentences.
# Typically, Penn Treebank is used, which contains texts from the Wall Street Journal (WSJ).
# In this example we will use the freely available Open American National Corpus (OANC).
print("load training data...")

def corpus(path, encoding="utf-8"):
    """ Yields sentences of (word, tag)-tuples from the given corpus,
        which is a .txt file with a sentence on each line,
        with slash-encoded tokens (e.g., the/DT cat/NN).
    """
    # Use a context manager so the file handle is closed deterministically
    # (the original opened the file without ever closing it).
    with open(path, encoding=encoding) as f:
        for line in f:
            # Each token is "word/TAG"; split on the first "/" boundaries.
            tokens = [t.split("/") for t in line.strip().split(" ")]
            # "&slash;" escapes a literal "/" inside a word.
            yield [(t[0].replace("&slash;", "/"), t[1]) for t in tokens]
# The corpus is included in the Pattern download zip, in pattern/test/corpora:
path = os.path.join(os.path.dirname(__file__), "..", "..", "test", "corpora", "tagged-en-oanc.txt")
# Materialize every tagged sentence up front; `data` is reused below for
# counting tag frequencies, training, and testing.
data = list(corpus(path))
# A parser is typically based on a lexicon of known words (aka a tag dictionary),
# that contains frequent words and their most frequent part-of-speech tag.
# This approach is fast. However, some words can have more than one tag,
# depending on their context in the sentence (e.g., "a can" vs. "can I").
# When we train a language model (i.e., a classifier),
# we want to make sure that it captures all ambiguity,
# ignoring ambiguous entries in the lexicon,
# handling them with the classifier instead.
# For example, the lexicon in pattern.en will always tag "about" as IN (preposition),
# even though it can also be used as RB (adverb) in about 25% of the cases.
# We will add "about" to the set of words in the lexicon to ignore
# when using a language model.
print("load training lexicon...")
# Tally how often each word occurs with each part-of-speech tag,
# e.g., {"about": {"IN": 750, "RB": 250}}.
tag_freq = defaultdict(lambda: defaultdict(int))
for sentence in data:
    for word, tag in sentence:
        tag_freq[word][tag] += 1
known, unknown = set(), set()
for word, tags in tag_freq.items():
    total = sum(tags.values())                 # total occurrences of this word
    top = max(tags, key=tags.__getitem__)      # its most frequent tag
    if float(tags[top]) / total >= 0.97 and total > 1:
        # Nearly unambiguous: always handled by the lexicon.
        known.add(word)
    if float(tags[top]) / total < 0.92 and word in lexicon:
        # Ambiguous lexicon entry: ignore it and let the model decide.
        unknown.add(word)
# A language model is a classifier (e.g., NB, KNN, SVM, SLP)
# trained on words and their context (= words to the left & right in sentence),
# that predicts the part-of-speech tag of unknown words.
# Take a look at the Model class in pattern/text/__init__.py.
# You'll see an internal Model._v() method
# that creates a training vector from a given word and its context,
# using information such as word suffix, first letter (i.e., for proper nouns),
# the part-of-speech tags of preceding words, surrounding tags, etc.
# Perceptron (SLP, single-layer averaged perceptron) works well for language models.
# Perceptron is an error-driven classifier.
# When given a training example (e.g., tagged word + surrounding words),
# it will check if it could correctly predict this example.
# If not, it will adjust its weights.
# So the accuracy of the perceptron can be improved significantly
# by training in multiple iterations, averaging out all weights.
# This will take several minutes.
# If you want it to run faster for experimentation,
# use less iterations or less data in the code below:
print("training model...")
seed(0) # Lock random list shuffling so we can compare runs.
m = Model(known=known, unknown=unknown, classifier=SLP())
# Multiple passes over (a shuffled subset of) the data improve the
# averaged perceptron's accuracy; see the notes above.
for iteration in range(5):
    for sentence in shuffled(data[:20000]):
        prev_token = None
        for i, (word, tag) in enumerate(sentence):
            # Look ahead to the following (word, tag) for context, if any.
            next_token = sentence[i + 1] if i < len(sentence) - 1 else None
            m.train(word, tag, prev_token, next_token)
            prev_token = (word, tag)
model_path = os.path.join(os.path.dirname(__file__), "en-model.slp")
m.save(model_path, final=True)
# Each parser in Pattern (pattern.en, pattern.es, pattern.it, ...)
# assumes that a lexicon of known words and their most frequent tag is available,
# along with some rules for morphology (suffixes, e.g., -ly = adverb)
# and context (surrounding words) for unknown words.
# If a language model is also available, it overrides these (simpler) rules.
# For English, this can raise accuracy from about 94% up to about 97%,
# and makes the parses about 3x faster.
print("loading model...")
f = os.path.join(os.path.dirname(__file__), "en-model.slp")
# Attach the trained model to the lexicon so the parser uses it
# in place of its simpler rules (see the note above).
lexicon.model = Model.load(f, lexicon)
# To test the accuracy of the language model,
# we can compare a tagged corpus to the predicted tags.
# This corpus must be different from the one used for training.
# Typically, sections 22, 23 and 24 of the WSJ are used.
# Note that the WSJ contains standardized English.
# The accuracy will be lower when tested on, for example, informal tweets.
# A different classifier could be trained for informal language use.
print("testing...")
# Compare the gold-standard tags of held-out sentences
# against the tags predicted by the (model-backed) parser.
correct, total = 0, 0
for gold in data[-5000:]:
    text = " ".join(word for word, tag in gold)
    tree = parsetree(text, tokenize=False)
    predicted = ((w.string, w.tag or "") for w in tree[0])
    for (w1, tag1), (w2, tag2) in zip(gold, predicted):
        if tag1 == tag2.split("-")[0]: # NNP-PERS => NNP
            correct += 1
        total += 1
print(float(correct) / total) # accuracy