from __future__ import print_function
from __future__ import unicode_literals
from __future__ import division

from builtins import str, bytes, dict, int
from builtins import map, zip
from builtins import range

import os
import sys
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", ".."))
import random

from collections import defaultdict
from pattern.text import Model
from pattern.vector import shuffled, SLP
from pattern.en import lexicon, parsetree
from random import seed

from io import open

# This example demonstrates how a Perceptron classifier
# can be used to construct an English language model
# (i.e., a classifier that predicts part-of-speech tags)
# by learning from a training set of tagged sentences.

# First we need training data: a corpus of manually annotated (= tagged) sentences.
# Typically, the Penn Treebank is used, which contains texts from the Wall Street Journal (WSJ).
# In this example we will use the freely available Open American National Corpus (OANC).

print("load training data...")

def corpus(path, encoding="utf-8"):
    """ Yields sentences of (word, tag)-tuples from the given corpus,
        which is a .txt file with a sentence on each line,
        with slash-encoded tokens (e.g., the/DT cat/NN).
    """
    for s in open(path, encoding=encoding):
        s = list(map(lambda w: w.split("/"), s.strip().split(" ")))       # "the/DT" => ["the", "DT"]
        s = list(map(lambda w: (w[0].replace("&slash;", "/"), w[1]), s))  # restore escaped slashes
        yield s
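
# For example, the line "the/DT cat/NN sat/VBD ./." yields:
# [("the", "DT"), ("cat", "NN"), ("sat", "VBD"), (".", ".")]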

# The corpus is included in the Pattern download zip, in pattern/test/corpora:
path = os.path.join(os.path.dirname(__file__), "..", "..", "test", "corpora", "tagged-en-oanc.txt")
data = list(corpus(path))
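
# Optional sanity check: how many tagged sentences did we load?
#print("%s sentences" % len(data))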

# A parser is typically based on a lexicon of known words (a.k.a. a tag dictionary)
# that contains frequent words and their most frequent part-of-speech tag.
# This approach is fast. However, some words can have more than one tag,
# depending on their context in the sentence (e.g., "a can" vs. "can I").

# When we train a language model (i.e., a classifier),
# we want to make sure that it captures this ambiguity,
# so ambiguous entries are ignored in the lexicon
# and handled by the classifier instead.

# For example, the lexicon in pattern.en will always tag "about" as IN (preposition),
# even though it can also be used as RB (adverb) in about 25% of the cases.
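
# The lexicon's fixed tag can be inspected directly
# (a sketch, assuming the lexicon behaves as a {word: tag} mapping,
#  consistent with the "w in lexicon" membership test used below):
if "about" in lexicon:
    print("about =>", lexicon["about"])  # IN, regardless of context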

# We will add "about" to the set of words in the lexicon to ignore
# when using a language model.

print("load training lexicon...")

f = defaultdict(lambda: defaultdict(int))  # {word1: {tag1: count, tag2: count, ...}}
for s in data:
    for w, tag in s:
        f[w][tag] += 1

known, unknown = set(), set()
for w, tags in f.items():
    n = sum(tags.values())  # total count
    m = sorted(tags, key=tags.__getitem__, reverse=True)[0]  # most frequent tag
    if float(tags[m]) / n >= 0.97 and n > 1:
        # Words that are always handled by the lexicon.
        known.add(w)
    if float(tags[m]) / n < 0.92 and w in lexicon:
        # Words in the lexicon that should be ignored and handled by the model.
        unknown.add(w)
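
# For example, with hypothetical counts {"IN": 750, "RB": 250} for "about",
# tags[m] / n = 0.75: this is below 0.92, so "about" is added to `unknown`
# and left to the classifier, while a word tagged the same way in 97%+ of
# the cases (and seen more than once) is pinned to the lexicon via `known`.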

# A language model is a classifier (e.g., NB, KNN, SVM, SLP),
# trained on words and their context (= the words to the left & right in the sentence),
# which predicts the part-of-speech tag of unknown words.

# Take a look at the Model class in pattern/text/__init__.py.
# You'll see an internal Model._v() method
# that creates a training vector from a given word and its context,
# using information such as the word suffix, the first letter (e.g., for proper nouns),
# the part-of-speech tags of preceding words, surrounding tags, etc.
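
# A minimal sketch of such a feature vector (NOT Pattern's actual Model._v();
# the feature names below are illustrative only):
def features(word, prev="", next=""):
    """ Returns a {feature: weight} dict for a word and its neighbors. """
    return {
        "word=" + word.lower(): 1,              # the word itself
        "suffix=" + word[-3:].lower(): 1,       # word suffix (e.g., -ing, -ly)
        "upper=" + str(word[:1].isupper()): 1,  # capitalized? (proper nouns)
        "prev=" + prev: 1,                      # tag of the preceding word
        "next=" + next.lower(): 1,              # the next word in the sentence
    }

# e.g., features("can", prev="PRP", next="fly")
# => {"word=can": 1, "suffix=can": 1, "upper=False": 1, "prev=PRP": 1, "next=fly": 1}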

# Perceptron (SLP, single-layer averaged perceptron) works well for language models.
# The perceptron is an error-driven classifier:
# when given a training example (e.g., a tagged word + surrounding words),
# it checks whether it could correctly predict this example.
# If not, it adjusts its weights.
# The accuracy of the perceptron can therefore be improved significantly
# by training in multiple iterations, averaging out all weights.
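
# A minimal sketch of the error-driven update (NOT Pattern's SLP internals):
# predict with the current weights; on a mistake, reward the features of the
# correct tag and penalize those of the guessed tag. An averaged perceptron
# additionally keeps a running total of each weight and predicts with the mean.
def perceptron_update(weights, v, guess, correct):
    """ Updates {feature: {tag: weight}} in-place for one training example,
        where v is a feature dict such as the one returned by features() above.
    """
    if guess != correct:
        for feature in v:
            w = weights.setdefault(feature, {})
            w[correct] = w.get(correct, 0) + 1
            w[guess] = w.get(guess, 0) - 1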

# This will take several minutes.
# If you want it to run faster for experimentation,
# use fewer iterations or less data in the code below:

print("training model...")

seed(0)  # Lock random list shuffling so we can compare runs.

m = Model(known=known, unknown=unknown, classifier=SLP())
for iteration in range(5):
    for s in shuffled(data[:20000]):
        prev = None  # (word, tag) to the left of the current word.
        next = None  # (word, tag) to the right of the current word.
        for i, (w, tag) in enumerate(s):
            if i < len(s) - 1:
                next = s[i + 1]
            m.train(w, tag, prev, next)
            prev = (w, tag)
            next = None

f = os.path.join(os.path.dirname(__file__), "en-model.slp")
m.save(f, final=True)

# Each parser in Pattern (pattern.en, pattern.es, pattern.it, ...)
# assumes that a lexicon of known words and their most frequent tag is available,
# along with some rules for morphology (suffixes, e.g., -ly = adverb)
# and context (surrounding words) for unknown words.

# If a language model is also available, it overrides these (simpler) rules.
# For English, this can raise the accuracy from about 94% up to about 97%,
# and it makes parsing about 3x faster.

print("loading model...")

f = os.path.join(os.path.dirname(__file__), "en-model.slp")
lexicon.model = Model.load(f, lexicon)

# To test the accuracy of the language model,
# we can compare a tagged corpus to the predicted tags.
# This corpus must be different from the one used for training.
# Typically, sections 22, 23 and 24 of the WSJ are used.

# Note that the WSJ contains standardized English.
# The accuracy will be lower when tested on, for example, informal tweets.
# A different classifier could be trained for informal language use.

print("testing...")

i, n = 0, 0
for s1 in data[-5000:]:
    s2 = " ".join(w for w, tag in s1)   # plain sentence string
    s2 = parsetree(s2, tokenize=False)  # re-tag it with the language model
    s2 = ((w.string, w.tag or "") for w in s2[0])
    for (w1, tag1), (w2, tag2) in zip(s1, s2):
        if tag1 == tag2.split("-")[0]:  # NNP-PERS => NNP
            i += 1
        n += 1

print(float(i) / n)  # accuracy
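
# With the trained model enabled, the reported accuracy should be close to
# the ~97% mentioned above (the exact number depends on the train/test split).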