You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

1505 lines
56 KiB
Python

# Natural Language Toolkit: Regexp Chunk Parser Application
#
# Copyright (C) 2001-2019 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
"""
A graphical tool for exploring the regular expression based chunk
parser ``nltk.chunk.RegexpChunkParser``.
"""
# Todo: Add a way to select the development set from the menubar. This
# might just need to be a selection box (conll vs treebank etc) plus
# configuration parameters to select what's being chunked (eg VP vs NP)
# and what part of the data is being used as the development set.
from __future__ import division
import time
import textwrap
import re
import random
from six.moves.tkinter import (
Button,
Canvas,
Checkbutton,
Frame,
IntVar,
Label,
Menu,
Scrollbar,
Text,
Tk,
)
from six.moves.tkinter_tkfiledialog import askopenfilename, asksaveasfilename
from six.moves.tkinter_font import Font
from nltk.tree import Tree
from nltk.util import in_idle
from nltk.draw.util import ShowText
from nltk.corpus import conll2000, treebank_chunk
from nltk.chunk import ChunkScore, RegexpChunkParser
from nltk.chunk.regexp import RegexpChunkRule
class RegexpChunkApp(object):
"""
A graphical tool for exploring the regular expression based chunk
parser ``nltk.chunk.RegexpChunkParser``.
See ``HELP`` for instructional text.
"""
##/////////////////////////////////////////////////////////////////
## Help Text
##/////////////////////////////////////////////////////////////////
#: A dictionary mapping from part of speech tags to descriptions,
#: which is used in the help text.  (This should probably live with
#: the conll and/or treebank corpus instead.)
#: The descriptions are shown verbatim on the 'Tags' help page.
TAGSET = {
    'CC': 'Coordinating conjunction',
    'PRP$': 'Possessive pronoun',
    'CD': 'Cardinal number',
    'RB': 'Adverb',
    'DT': 'Determiner',
    'RBR': 'Adverb, comparative',
    'EX': 'Existential there',
    'RBS': 'Adverb, superlative',
    'FW': 'Foreign word',
    'RP': 'Particle',
    'JJ': 'Adjective',
    'TO': 'to',
    'JJR': 'Adjective, comparative',
    'UH': 'Interjection',
    'JJS': 'Adjective, superlative',
    'VB': 'Verb, base form',
    'LS': 'List item marker',
    'VBD': 'Verb, past tense',
    'MD': 'Modal',
    'NNS': 'Noun, plural',
    # Was 'Noun, singular or masps' -- a typo for "mass".
    'NN': 'Noun, singular or mass',
    'VBN': 'Verb, past participle',
    'VBZ': 'Verb,3rd ps. sing. present',
    'NNP': 'Proper noun, singular',
    'NNPS': 'Proper noun plural',
    'WDT': 'wh-determiner',
    'PDT': 'Predeterminer',
    'WP': 'wh-pronoun',
    'POS': 'Possessive ending',
    'WP$': 'Possessive wh-pronoun',
    'PRP': 'Personal pronoun',
    'WRB': 'wh-adverb',
    '(': 'open parenthesis',
    ')': 'close parenthesis',
    '``': 'open quote',
    ',': 'comma',
    "''": 'close quote',
    '.': 'period',
    '#': 'pound sign (currency marker)',
    '$': 'dollar sign (currency marker)',
    'IN': 'Preposition/subord. conjunction',
    'SYM': 'Symbol (mathematical or scientific)',
    'VBG': 'Verb, gerund/present participle',
    'VBP': 'Verb, non-3rd ps. sing. present',
    ':': 'colon',
}
#: Contents for the help box.  This is a list of tuples, one for
#: each help page, where each tuple has three elements:
#:   - A title (displayed as a tab)
#:   - A string description of tabstops (see Tkinter.Text for details)
#:   - The text contents for the help page.  You can use expressions
#:     like <red>...</red> to colorize the text; see ``HELP_AUTOTAG``
#:     for a list of tags you can use for colorizing.
HELP = [
    (
        'Help',
        '20',
        "Welcome to the regular expression chunk-parser grammar editor.  "
        "You can use this editor to develop and test chunk parser grammars "
        "based on NLTK's RegexpChunkParser class.\n\n"
        # Help box.
        "Use this box ('Help') to learn more about the editor; click on the "
        "tabs for help on specific topics:"
        "<indent>\n"
        "Rules: grammar rule types\n"
        "Regexps: regular expression syntax\n"
        "Tags: part of speech tags\n</indent>\n"
        # Grammar.
        "Use the upper-left box ('Grammar') to edit your grammar.  "
        "Each line of your grammar specifies a single 'rule', "
        "which performs an action such as creating a chunk or merging "
        "two chunks.\n\n"
        # Dev set.
        "The lower-left box ('Development Set') runs your grammar on the "
        "development set, and displays the results.  "
        "Your grammar's chunks are <highlight>highlighted</highlight>, and "
        "the correct (gold standard) chunks are "
        "<underline>underlined</underline>.  If they "
        "match, they are displayed in <green>green</green>; otherwise, "
        "they are displayed in <red>red</red>.  The box displays a single "
        "sentence from the development set at a time; use the scrollbar or "
        "the next/previous buttons to view additional sentences.\n\n"
        # Performance
        "The lower-right box ('Evaluation') tracks the performance of "
        "your grammar on the development set.  The 'precision' axis "
        "indicates how many of your grammar's chunks are correct; and "
        "the 'recall' axis indicates how many of the gold standard "
        "chunks your system generated.  Typically, you should try to "
        "design a grammar that scores high on both metrics.  The "
        "exact precision and recall of the current grammar, as well "
        "as their harmonic mean (the 'f-score'), are displayed in "
        "the status bar at the bottom of the window.",
    ),
    (
        'Rules',
        '10',
        "<h1>{...regexp...}</h1>"
        "<indent>\nChunk rule: creates new chunks from words matching "
        "regexp.</indent>\n\n"
        "<h1>}...regexp...{</h1>"
        "<indent>\nChink rule: removes words matching regexp from existing "
        "chunks.</indent>\n\n"
        "<h1>...regexp1...}{...regexp2...</h1>"
        "<indent>\nSplit rule: splits chunks that match regexp1 followed by "
        "regexp2 in two.</indent>\n\n"
        # The heading names both patterns so that it agrees with the
        # description below it (and with the split rule above).
        "<h1>...regexp1...{}...regexp2...</h1>"
        "<indent>\nMerge rule: joins consecutive chunks that match regexp1 "
        "and regexp2</indent>\n",
    ),
    (
        'Regexps',
        '10 60',
        # "Regular Expression Syntax Summary:\n\n"
        "<h1>Pattern\t\tMatches...</h1>\n"
        "<hangindent>"
        "\t<<var>T</var>>\ta word with tag <var>T</var> "
        "(where <var>T</var> may be a regexp).\n"
        "\t<var>x</var>?\tan optional <var>x</var>\n"
        "\t<var>x</var>+\ta sequence of 1 or more <var>x</var>'s\n"
        "\t<var>x</var>*\ta sequence of 0 or more <var>x</var>'s\n"
        "\t<var>x</var>|<var>y</var>\t<var>x</var> or <var>y</var>\n"
        "\t.\tmatches any character\n"
        "\t(<var>x</var>)\tTreats <var>x</var> as a group\n"
        "\t# <var>x...</var>\tTreats <var>x...</var> "
        "(to the end of the line) as a comment\n"
        "\t\\<var>C</var>\tmatches character <var>C</var> "
        "(useful when <var>C</var> is a special character "
        "like + or #)\n"
        "</hangindent>"
        "\n<h1>Examples:</h1>\n"
        "<hangindent>"
        '\t<regexp><NN></regexp>\n'
        '\t\tMatches <match>"cow/NN"</match>\n'
        '\t\tMatches <match>"green/NN"</match>\n'
        '\t<regexp><VB.*></regexp>\n'
        '\t\tMatches <match>"eating/VBG"</match>\n'
        '\t\tMatches <match>"ate/VBD"</match>\n'
        '\t<regexp><IN><DT><NN></regexp>\n'
        '\t\tMatches <match>"on/IN the/DT car/NN"</match>\n'
        '\t<regexp><RB>?<VBD></regexp>\n'
        '\t\tMatches <match>"ran/VBD"</match>\n'
        '\t\tMatches <match>"slowly/RB ate/VBD"</match>\n'
        # The backslash is escaped explicitly: '\#' is not a valid string
        # escape and triggers a warning; the displayed text is unchanged.
        '\t<regexp><\\#><CD> # This is a comment...</regexp>\n'
        '\t\tMatches <match>"#/# 100/CD"</match>\n'
        "</hangindent>",
    ),
    (
        'Tags',
        '10 60',
        "<h1>Part of Speech Tags:</h1>\n"
        + '<hangindent>'
        + '<<TAGSET>>'  # replaced by show_help() with the tag table
        + '</hangindent>\n',
    ),
]
#: Markup tags recognised in the ``HELP`` texts.  Each entry pairs a tag
#: name (used as ``<name>...</name>``) with the keyword options that are
#: passed to the help box's ``tag_config`` for the text inside the tag.
HELP_AUTOTAG = [
    ('red', {'foreground': '#a00'}),
    ('green', {'foreground': '#080'}),
    ('highlight', {'background': '#ddd'}),
    ('underline', {'underline': True}),
    ('h1', {'underline': True}),
    ('indent', {'lmargin1': 20, 'lmargin2': 20}),
    ('hangindent', {'lmargin1': 0, 'lmargin2': 60}),
    ('var', {'foreground': '#88f'}),
    ('regexp', {'foreground': '#ba7'}),
    ('match', {'foreground': '#6a6'}),
]
##/////////////////////////////////////////////////////////////////
##  Config Parameters
##/////////////////////////////////////////////////////////////////

#: If the user has not pressed any key for this amount of time (in
#: seconds), and the current grammar has not been evaluated, then
#: the eval demon will evaluate it.
_EVAL_DELAY = 1

#: The number of sentences that should be evaluated by the eval
#: demon each time it runs.
_EVAL_CHUNK = 15

#: The frequency (in seconds) at which the eval demon is run.
_EVAL_FREQ = 0.2

#: The minimum amount of time that the eval demon should take each time
#: it runs -- if it takes less than this time, _EVAL_CHUNK will be
#: modified upwards.
_EVAL_DEMON_MIN = 0.02

#: The maximum amount of time that the eval demon should take each time
#: it runs -- if it takes more than this time, _EVAL_CHUNK will be
#: modified downwards.
_EVAL_DEMON_MAX = 0.04

#: Keyword options for the grammar ``Text`` box.
_GRAMMARBOX_PARAMS = {
    'width': 40,
    'height': 12,
    'background': '#efe',
    'highlightbackground': '#efe',
    'highlightthickness': 1,
    'relief': 'groove',
    'border': 2,
    'wrap': 'word',
}

#: Keyword options for the help ``Text`` box.
_HELPBOX_PARAMS = {
    'width': 15,
    'height': 15,
    'background': '#efe',
    'highlightbackground': '#efe',
    'foreground': '#555',
    'highlightthickness': 1,
    'relief': 'groove',
    'border': 2,
    'wrap': 'word',
}

#: Keyword options for the development-set ``Text`` box.
_DEVSETBOX_PARAMS = {
    'width': 70,
    'height': 10,
    'background': '#eef',
    'highlightbackground': '#eef',
    'highlightthickness': 1,
    'relief': 'groove',
    'border': 2,
    'wrap': 'word',
    'tabs': (30,),
}

#: Keyword options for the status ``Label``.
_STATUS_PARAMS = {'background': '#9bb', 'relief': 'groove', 'border': 2}

_FONT_PARAMS = {'family': 'helvetica', 'size': -20}

#: Keyword options for the outer ``Frame``; its background colour is
#: reused for the inner frames and spacers.
_FRAME_PARAMS = {'background': '#777', 'padx': 2, 'pady': 2, 'border': 3}

#: Keyword options for the evaluation ``Canvas``.
_EVALBOX_PARAMS = {
    'background': '#eef',
    'highlightbackground': '#eef',
    'highlightthickness': 1,
    'relief': 'groove',
    'border': 2,
    'width': 300,
    'height': 280,
}

#: Keyword options shared by the buttons and checkbuttons.
_BUTTON_PARAMS = {
    'background': '#777',
    'activebackground': '#777',
    'highlightbackground': '#777',
}

_HELPTAB_BG_COLOR = '#aba'
_HELPTAB_FG_COLOR = '#efe'

#: Options applied to the help tab that is currently selected.
_HELPTAB_FG_PARAMS = {'background': '#efe'}

#: Options applied to the help tabs that are not selected.
_HELPTAB_BG_PARAMS = {'background': '#aba'}

#: Width of the spacer frame placed after each help tab.
_HELPTAB_SPACER = 6
def normalize_grammar(self, grammar):
    """
    Return a canonical form of a grammar string, so that two grammars
    that differ only in comments or whitespace compare equal.

    :param grammar: The grammar text, one rule per line.
    :return: The grammar with ``#`` comments removed, runs of spaces
        collapsed, whitespace after newlines removed, leading and
        trailing whitespace stripped, and unescaped ``$`` characters
        backslash-escaped.
    """
    # Strip comments.  A backslash-escaped character (e.g. ``\#``) is
    # consumed as a pair, so an escaped '#' does not start a comment.
    grammar = re.sub(r'((\\.|[^#])*)(#.*)?', r'\1', grammar)
    # Normalize whitespace.  (Raw strings are used for the patterns:
    # '\s' is not a valid escape in an ordinary string literal.)
    grammar = re.sub(' +', ' ', grammar)
    grammar = re.sub(r'\n\s+', '\n', grammar)
    grammar = grammar.strip()
    # [xx] Hack: automatically backslash $!
    grammar = re.sub(r'([^\\])\$', r'\1\\$', grammar)
    return grammar
def __init__(
    self,
    devset_name='conll2000',
    devset=None,
    grammar='',
    chunk_label='NP',
    tagset=None,
):
    """
    :param devset_name: The name of the development set; used for
        display & for save files.  If either the name 'treebank'
        or the name 'conll2000' is used, and devset is None, then
        devset will be set automatically.
    :param devset: A list of chunked sentences
    :param grammar: The initial grammar to display.
    :param chunk_label: The chunk label that is scored; it is passed
        to ``ChunkScore``.
    :param tagset: Dictionary from tags to string descriptions, used
        for the help page.  Defaults to ``self.TAGSET``.
    :raise ValueError: If ``devset`` is None and ``devset_name`` is
        not one of the known development set names.
    """
    self._chunk_label = chunk_label

    if tagset is None:
        tagset = self.TAGSET
    self.tagset = tagset

    # Named development sets:
    if devset is None:
        if devset_name == 'conll2000':
            devset = conll2000.chunked_sents('train.txt')  # [:100]
        # The name, not the (None) devset, selects the corpus.  Testing
        # ``devset`` here made 'treebank' unreachable.
        elif devset_name == 'treebank':
            devset = treebank_chunk.chunked_sents()  # [:100]
        else:
            raise ValueError('Unknown development set %s' % devset_name)

    self.chunker = None
    """The chunker built from the grammar string"""

    self.grammar = grammar
    """The unparsed grammar string"""

    self.normalized_grammar = None
    """A normalized version of ``self.grammar``."""

    self.grammar_changed = 0
    """The last time() that the grammar was changed."""

    self.devset = devset
    """The development set -- a list of chunked sentences."""

    self.devset_name = devset_name
    """The name of the development set (for save files)."""

    self.devset_index = -1
    """The index into the development set of the first instance
       that's currently being viewed."""

    self._last_keypress = 0
    """The time() when a key was most recently pressed"""

    self._history = []
    """A list of (grammar, precision, recall, fscore) tuples for
       grammars that the user has already tried."""

    self._history_index = 0
    """When the user is scrolling through previous grammars, this
       is used to keep track of which grammar they're looking at."""

    self._eval_grammar = None
    """The grammar that is being currently evaluated by the eval
       demon."""

    self._eval_normalized_grammar = None
    """A normalized copy of ``_eval_grammar``."""

    self._eval_index = 0
    """The index of the next sentence in the development set that
       should be looked at by the eval demon."""

    self._eval_score = ChunkScore(chunk_label=chunk_label)
    """The ``ChunkScore`` object that's used to keep track of the score
    of the current grammar on the development set."""

    # Set up the main window.
    top = self.top = Tk()
    top.geometry('+50+50')
    top.title('Regexp Chunk Parser App')
    top.bind('<Control-q>', self.destroy)

    # Variable that restricts how much of the devset we look at.
    self._devset_size = IntVar(top)
    self._devset_size.set(100)

    # Set up all the tkinter widgets.  Fonts come first because the
    # widgets are created with them; the menubar reads variables that
    # the earlier steps create.
    self._init_fonts(top)
    self._init_widgets(top)
    self._init_bindings(top)
    self._init_menubar(top)
    self.grammarbox.focus()

    # If a grammar was given, then display it.
    if grammar:
        self.grammarbox.insert('end', grammar + '\n')
        self.grammarbox.mark_set('insert', '1.0')

    # Display the first item in the development set
    self.show_devset(0)
    self.update()
def _init_bindings(self, top):
    """
    Install the keyboard shortcuts and the canvas resize handler.

    :param top: The top-level window that receives the global bindings.
    """
    window_shortcuts = (
        ('<Control-n>', self._devset_next),
        ('<Control-p>', self._devset_prev),
        ('<Control-t>', self.toggle_show_trace),
        ('<KeyPress>', self.update),
        ('<Control-s>', lambda e: self.save_grammar()),
        ('<Control-o>', lambda e: self.load_grammar()),
    )
    for sequence, handler in window_shortcuts:
        top.bind(sequence, handler)

    # The same navigation shortcuts are also bound on the grammar box
    # itself.
    grammarbox_shortcuts = (
        ('<Control-t>', self.toggle_show_trace),
        ('<Control-n>', self._devset_next),
        ('<Control-p>', self._devset_prev),
    )
    for sequence, handler in grammarbox_shortcuts:
        self.grammarbox.bind(sequence, handler)

    # Redraw the eval graph when the window size changes
    self.evalbox.bind('<Configure>', self._eval_plot)
def _init_fonts(self, top):
    """
    Create the size variable and the two fonts used by the widgets.

    :param top: The window that owns the ``IntVar`` holding the size.
    """
    # What's our font size (default=same as sysfont)
    self._size = IntVar(top)
    self._size.set(20)

    # Sizes are passed as negative numbers, as in ``_FONT_PARAMS``.
    base_size = self._size.get()
    small_size = int(base_size * 14 // 20)
    self._font = Font(family='helvetica', size=-base_size)
    self._smallfont = Font(family='helvetica', size=-(small_size))
def _init_menubar(self, parent):
    """
    Build the File, View, Development-Set and Help menus and install
    the resulting menubar on ``parent``.

    :param parent: The window that the menubar is attached to.
    """
    menubar = Menu(parent)

    # File menu: one dict of ``add_command`` options per entry.
    file_menu = Menu(menubar, tearoff=0)
    file_entries = (
        dict(label='Reset Application', underline=0, command=self.reset),
        dict(
            label='Save Current Grammar',
            underline=0,
            accelerator='Ctrl-s',
            command=self.save_grammar,
        ),
        dict(
            label='Load Grammar',
            underline=0,
            accelerator='Ctrl-o',
            command=self.load_grammar,
        ),
        dict(label='Save Grammar History', underline=13, command=self.save_history),
        dict(label='Exit', underline=1, command=self.destroy, accelerator='Ctrl-q'),
    )
    for options in file_entries:
        file_menu.add_command(**options)
    menubar.add_cascade(label='File', underline=0, menu=file_menu)

    # View menu: radio buttons that set the font size variable.
    view_menu = Menu(menubar, tearoff=0)
    font_sizes = (
        ('Tiny', 10),
        ('Small', 16),
        ('Medium', 20),
        ('Large', 24),
        ('Huge', 34),
    )
    for size_label, size_value in font_sizes:
        view_menu.add_radiobutton(
            label=size_label,
            variable=self._size,
            underline=0,
            value=size_value,
            command=self.resize,
        )
    menubar.add_cascade(label='View', underline=0, menu=view_menu)

    # Development-Set menu: radio buttons that set how many sentences
    # of the development set are used.
    devset_menu = Menu(menubar, tearoff=0)
    for sentence_count in (50, 100, 200, 500):
        devset_menu.add_radiobutton(
            label='%d sentences' % sentence_count,
            variable=self._devset_size,
            value=sentence_count,
            command=self.set_devset_size,
        )
    menubar.add_cascade(label='Development-Set', underline=0, menu=devset_menu)

    # Help menu.
    help_menu = Menu(menubar, tearoff=0)
    help_menu.add_command(label='About', underline=0, command=self.about)
    menubar.add_cascade(label='Help', underline=0, menu=help_menu)

    parent.config(menu=menubar)
def toggle_show_trace(self, *e):
    """
    Switch the development-set box between the plain example view and
    the trace view.

    :param e: Ignored; accepts the event passed by a key binding.
    :return: ``'break'``.
    """
    switch_view = self.show_devset if self._showing_trace else self.show_trace
    switch_view()
    return 'break'
# Upper bound on how many of the most recent history entries
# ``_eval_plot`` scans to choose the axis ranges when the 'Zoom'
# (autoscale) option is on.
_SCALE_N = 5  # center on the last 5 examples.
# NOTE(review): not referenced anywhere in the visible part of this
# file; line drawing in ``_eval_plot`` is controlled by ``_eval_lines``.
_DRAW_LINES = False
def _eval_plot(self, *e, **config):
    """
    Redraw the precision/recall plot on ``self.evalbox`` from
    ``self._history``.

    The canvas is cleared and everything is drawn again: the axis
    labels, two background-coloured mask rectangles, grid and axis
    lines, the scale percentages, and one dot per history entry.  The
    entry at ``self._history_index`` is drawn in a different colour
    and its scores are written to the status label.

    :param e: Ignored; accepts the event passed when this is used as
        an event handler.
    :param config: May contain ``width`` and ``height`` overriding the
        canvas size reported by the widget.
    """
    width = config.get('width', self.evalbox.winfo_width())
    height = config.get('height', self.evalbox.winfo_height())

    # Clear the canvas
    self.evalbox.delete('all')

    # Draw the precision & recall labels.  The plot area is positioned
    # relative to the bounding boxes of these two labels.
    tag = self.evalbox.create_text(
        10, height // 2 - 10, justify='left', anchor='w', text='Precision'
    )
    left, right = self.evalbox.bbox(tag)[2] + 5, width - 10
    tag = self.evalbox.create_text(
        left + (width - left) // 2,
        height - 10,
        anchor='s',
        text='Recall',
        justify='center',
    )
    top, bot = 10, self.evalbox.bbox(tag)[1] - 10

    # Draw masks for clipping the plot.
    bg = self._EVALBOX_PARAMS['background']
    self.evalbox.lower(
        self.evalbox.create_rectangle(0, 0, left - 1, 5000, fill=bg, outline=bg)
    )
    self.evalbox.lower(
        self.evalbox.create_rectangle(0, bot + 1, 5000, 5000, fill=bg, outline=bg)
    )

    # Calculate the plot's scale.
    if self._autoscale.get() and len(self._history) > 1:
        max_precision = max_recall = 0
        min_precision = min_recall = 1
        # Scan history entries from the newest (index -1) backwards.
        # NOTE(review): the range stops at min(len(history), _SCALE_N + 1),
        # so with _SCALE_N or fewer entries the oldest one is not
        # scanned -- confirm whether that is intended.
        for i in range(1, min(len(self._history), self._SCALE_N + 1)):
            grammar, precision, recall, fmeasure = self._history[-i]
            min_precision = min(precision, min_precision)
            min_recall = min(recall, min_recall)
            max_precision = max(precision, max_precision)
            max_recall = max(recall, max_recall)
        #             if max_precision-min_precision > max_recall-min_recall:
        #                 min_recall -= (max_precision-min_precision)/2
        #                 max_recall += (max_precision-min_precision)/2
        #             else:
        #                 min_precision -= (max_recall-min_recall)/2
        #                 max_precision += (max_recall-min_recall)/2
        #             if min_recall < 0:
        #                 max_recall -= min_recall
        #                 min_recall = 0
        #             if min_precision < 0:
        #                 max_precision -= min_precision
        #                 min_precision = 0
        # Pad each range by 0.01 and clamp it to [0, 1].
        min_precision = max(min_precision - 0.01, 0)
        min_recall = max(min_recall - 0.01, 0)
        max_precision = min(max_precision + 0.01, 1)
        max_recall = min(max_recall + 0.01, 1)
    else:
        min_precision = min_recall = 0
        max_precision = max_recall = 1

    # Draw the axis lines & grid lines
    for i in range(11):
        # Canvas position of the i*10% mark on each axis; a grid line
        # is drawn only where the mark falls strictly inside the plot.
        x = left + (right - left) * (
            (i / 10.0 - min_recall) / (max_recall - min_recall)
        )
        y = bot - (bot - top) * (
            (i / 10.0 - min_precision) / (max_precision - min_precision)
        )
        if left < x < right:
            self.evalbox.create_line(x, top, x, bot, fill='#888')
        if top < y < bot:
            self.evalbox.create_line(left, y, right, y, fill='#888')
    self.evalbox.create_line(left, top, left, bot)
    self.evalbox.create_line(left, bot, right, bot)

    # Display the plot's scale
    self.evalbox.create_text(
        left - 3,
        bot,
        justify='right',
        anchor='se',
        text='%d%%' % (100 * min_precision),
    )
    self.evalbox.create_text(
        left - 3,
        top,
        justify='right',
        anchor='ne',
        text='%d%%' % (100 * max_precision),
    )
    self.evalbox.create_text(
        left,
        bot + 3,
        justify='center',
        anchor='nw',
        text='%d%%' % (100 * min_recall),
    )
    self.evalbox.create_text(
        right,
        bot + 3,
        justify='center',
        anchor='ne',
        text='%d%%' % (100 * max_recall),
    )

    # Display the scores.
    prev_x = prev_y = None
    for i, (_, precision, recall, fscore) in enumerate(self._history):
        x = left + (right - left) * (
            (recall - min_recall) / (max_recall - min_recall)
        )
        y = bot - (bot - top) * (
            (precision - min_precision) / (max_precision - min_precision)
        )
        if i == self._history_index:
            # The entry being viewed: drawn without lower(), and its
            # scores go to the status label.
            self.evalbox.create_oval(
                x - 2, y - 2, x + 2, y + 2, fill='#0f0', outline='#000'
            )
            self.status['text'] = (
                'Precision: %.2f%%\t' % (precision * 100)
                + 'Recall: %.2f%%\t' % (recall * 100)
                + 'F-score: %.2f%%' % (fscore * 100)
            )
        else:
            self.evalbox.lower(
                self.evalbox.create_oval(
                    x - 2, y - 2, x + 2, y + 2, fill='#afa', outline='#8c8'
                )
            )
        # Connect consecutive history entries when 'Lines' is checked.
        if prev_x is not None and self._eval_lines.get():
            self.evalbox.lower(
                self.evalbox.create_line(prev_x, prev_y, x, y, fill='#8c8')
            )
        prev_x, prev_y = x, y
# Class-level default.  ``_eval_demon`` sets the instance attribute to
# True when it reschedules itself and to False when it stops.
_eval_demon_running = False
def _eval_demon(self):
    """
    Score the current grammar against the development set, one batch
    of ``_EVAL_CHUNK`` sentences per call, rescheduling itself with
    ``self.top.after`` until the whole set has been scored.

    When scoring finishes, a ``(grammar, precision, recall, f-measure)``
    tuple is appended to ``self._history`` and the plot is redrawn.
    If the grammar normalizes to the same text as one already in the
    history, that entry's scores are reused instead of re-scoring.
    """
    # The window has been destroyed.
    if self.top is None:
        return
    # No chunker to evaluate with.
    if self.chunker is None:
        self._eval_demon_running = False
        return

    # Note our starting time.
    started_at = time.time()
    reschedule_ms = int(self._EVAL_FREQ * 1000)

    grammar_changed = self.normalized_grammar != self._eval_normalized_grammar

    # If the user is still typing, then wait for them to finish.
    since_keypress = time.time() - self._last_keypress
    if grammar_changed and since_keypress < self._EVAL_DELAY:
        self._eval_demon_running = True
        return self.top.after(reschedule_ms, self._eval_demon)

    # If the grammar changed, restart the evaluation.
    if grammar_changed:
        # Check if we've seen this grammar already.  If so, then
        # just use the old evaluation values.
        for old_grammar, precision, recall, fscore in self._history:
            if self.normalized_grammar == self.normalize_grammar(old_grammar):
                self._history.append((old_grammar, precision, recall, fscore))
                self._history_index = len(self._history) - 1
                self._eval_plot()
                self._eval_demon_running = False
                self._eval_normalized_grammar = None
                return
        self._eval_index = 0
        self._eval_score = ChunkScore(chunk_label=self._chunk_label)
        self._eval_grammar = self.grammar
        self._eval_normalized_grammar = self.normalized_grammar

    # If the grammar is empty, then don't bother evaluating it, or
    # recording it in history -- the score will just be 0.
    if self.normalized_grammar.strip() == '':
        # self._eval_index = self._devset_size.get()
        self._eval_demon_running = False
        return

    # Score the next set of examples
    batch_end = min(self._eval_index + self._EVAL_CHUNK, self._devset_size.get())
    for gold in self.devset[self._eval_index : batch_end]:
        guess = self._chunkparse(gold.leaves())
        self._eval_score.score(gold, guess)

    # update our index in the devset.
    self._eval_index += self._EVAL_CHUNK

    if self._eval_index < self._devset_size.get():
        # Not done yet: report progress and schedule the next batch.
        progress = 100 * self._eval_index / self._devset_size.get()
        self.status['text'] = 'Evaluating on Development Set (%d%%)' % progress
        self._eval_demon_running = True
        self._adaptively_modify_eval_chunk(time.time() - started_at)
        self.top.after(reschedule_ms, self._eval_demon)
    else:
        # Done: record the scores and redraw the plot.
        finished = (
            self._eval_grammar,
            self._eval_score.precision(),
            self._eval_score.recall(),
            self._eval_score.f_measure(),
        )
        self._history.append(finished)
        self._history_index = len(self._history) - 1
        self._eval_plot()
        self._eval_demon_running = False
        self._eval_normalized_grammar = None
def _adaptively_modify_eval_chunk(self, t):
    """
    Modify _EVAL_CHUNK to try to keep the amount of time that the
    eval demon takes between _EVAL_DEMON_MIN and _EVAL_DEMON_MAX.

    The chunk size changes by at least 1 and at most 10 per call, and
    is only reduced while it is greater than 5.

    :param t: The amount of time that the eval demon took.  A value
        of zero or less (possible with a coarse or adjusted clock) is
        treated as "too fast to measure" and grows the chunk by the
        maximum step instead of dividing by zero.
    """
    chunk = self._EVAL_CHUNK
    if t > self._EVAL_DEMON_MAX and chunk > 5:
        # Too slow: shrink proportionally, by 1 to 10 sentences.
        scaled = int(chunk * (self._EVAL_DEMON_MAX / t))
        self._EVAL_CHUNK = min(chunk - 1, max(scaled, chunk - 10))
    elif t < self._EVAL_DEMON_MIN:
        # Too fast: grow proportionally, by 1 to 10 sentences.
        if t > 0:
            scaled = int(chunk * (self._EVAL_DEMON_MIN / t))
        else:
            # Limit of the proportional rule as t approaches zero.
            scaled = chunk + 10
        self._EVAL_CHUNK = max(chunk + 1, min(scaled, chunk + 10))
def _init_widgets(self, top):
    """
    Create and lay out every widget of the application inside a
    single grid frame: the grammar box (upper left), the help box
    with its tabs (upper right), the development-set box (lower
    left), the evaluation canvas (lower right), their buttons, and
    the status label along the bottom.

    :param top: The top-level window that contains the widgets.
    """
    frame0 = Frame(top, **self._FRAME_PARAMS)
    # Columns 0 and 3 and rows 1 and 5 hold the four main boxes; they
    # are the only ones given a resize weight.
    frame0.grid_columnconfigure(0, weight=4)
    frame0.grid_columnconfigure(3, weight=2)
    frame0.grid_rowconfigure(1, weight=1)
    frame0.grid_rowconfigure(5, weight=1)

    # The grammar
    self.grammarbox = Text(frame0, font=self._font, **self._GRAMMARBOX_PARAMS)
    self.grammarlabel = Label(
        frame0,
        font=self._font,
        text='Grammar:',
        highlightcolor='black',
        background=self._GRAMMARBOX_PARAMS['background'],
    )
    self.grammarlabel.grid(column=0, row=0, sticky='SW')
    self.grammarbox.grid(column=0, row=1, sticky='NEWS')

    # Scroll bar for grammar
    grammar_scrollbar = Scrollbar(frame0, command=self.grammarbox.yview)
    grammar_scrollbar.grid(column=1, row=1, sticky='NWS')
    self.grammarbox.config(yscrollcommand=grammar_scrollbar.set)

    # grammar buttons
    bg = self._FRAME_PARAMS['background']
    frame3 = Frame(frame0, background=bg)
    frame3.grid(column=0, row=2, sticky='EW')
    Button(
        frame3,
        text='Prev Grammar',
        command=self._history_prev,
        **self._BUTTON_PARAMS
    ).pack(side='left')
    Button(
        frame3,
        text='Next Grammar',
        command=self._history_next,
        **self._BUTTON_PARAMS
    ).pack(side='left')

    # Help box
    self.helpbox = Text(frame0, font=self._smallfont, **self._HELPBOX_PARAMS)
    self.helpbox.grid(column=3, row=1, sticky='NEWS')
    # Maps each help page title to the Label acting as its tab.
    self.helptabs = {}
    bg = self._FRAME_PARAMS['background']
    helptab_frame = Frame(frame0, background=bg)
    helptab_frame.grid(column=3, row=0, sticky='SW')
    for i, (tab, tabstops, text) in enumerate(self.HELP):
        label = Label(helptab_frame, text=tab, font=self._smallfont)
        label.grid(column=i * 2, row=0, sticky='S')
        # help_frame.grid_columnconfigure(i, weight=1)
        # label.pack(side='left')
        # ``tab=tab`` binds the current title as a default argument so
        # each label opens its own page.
        label.bind('<ButtonPress>', lambda e, tab=tab: self.show_help(tab))
        self.helptabs[tab] = label
        # Spacer between this tab and the next one.
        Frame(
            helptab_frame, height=1, width=self._HELPTAB_SPACER, background=bg
        ).grid(column=i * 2 + 1, row=0)
    self.helptabs[self.HELP[0][0]].configure(font=self._font)

    # Text tags used by show_help(): 'elide' hides the markup itself,
    # and one 'tag-<name>' tag per HELP_AUTOTAG entry styles its text.
    self.helpbox.tag_config('elide', elide=True)
    for (tag, params) in self.HELP_AUTOTAG:
        self.helpbox.tag_config('tag-%s' % tag, **params)

    self.show_help(self.HELP[0][0])

    # Scroll bar for helpbox
    help_scrollbar = Scrollbar(frame0, command=self.helpbox.yview)
    self.helpbox.config(yscrollcommand=help_scrollbar.set)
    help_scrollbar.grid(column=4, row=1, sticky='NWS')

    # The dev set
    frame4 = Frame(frame0, background=self._FRAME_PARAMS['background'])
    self.devsetbox = Text(frame4, font=self._font, **self._DEVSETBOX_PARAMS)
    self.devsetbox.pack(expand=True, fill='both')
    self.devsetlabel = Label(
        frame0,
        font=self._font,
        text='Development Set:',
        justify='right',
        background=self._DEVSETBOX_PARAMS['background'],
    )
    self.devsetlabel.grid(column=0, row=4, sticky='SW')
    frame4.grid(column=0, row=5, sticky='NEWS')

    # dev set scrollbars
    # The vertical scrollbar is wired to self._devset_scroll rather
    # than to the text widget's own yview.
    self.devset_scroll = Scrollbar(frame0, command=self._devset_scroll)
    self.devset_scroll.grid(column=1, row=5, sticky='NWS')
    self.devset_xscroll = Scrollbar(
        frame4, command=self.devsetbox.xview, orient='horiz'
    )
    self.devsetbox['xscrollcommand'] = self.devset_xscroll.set
    self.devset_xscroll.pack(side='bottom', fill='x')

    # dev set buttons
    bg = self._FRAME_PARAMS['background']
    frame1 = Frame(frame0, background=bg)
    frame1.grid(column=0, row=7, sticky='EW')
    Button(
        frame1,
        text='Prev Example (Ctrl-p)',
        command=self._devset_prev,
        **self._BUTTON_PARAMS
    ).pack(side='left')
    Button(
        frame1,
        text='Next Example (Ctrl-n)',
        command=self._devset_next,
        **self._BUTTON_PARAMS
    ).pack(side='left')
    # 'Show example' starts disabled; show_trace() enables it and
    # disables 'Show trace'.
    self.devset_button = Button(
        frame1,
        text='Show example',
        command=self.show_devset,
        state='disabled',
        **self._BUTTON_PARAMS
    )
    self.devset_button.pack(side='right')
    self.trace_button = Button(
        frame1, text='Show trace', command=self.show_trace, **self._BUTTON_PARAMS
    )
    self.trace_button.pack(side='right')

    # evaluation box
    self.evalbox = Canvas(frame0, **self._EVALBOX_PARAMS)
    label = Label(
        frame0,
        font=self._font,
        text='Evaluation:',
        justify='right',
        background=self._EVALBOX_PARAMS['background'],
    )
    label.grid(column=3, row=4, sticky='SW')
    self.evalbox.grid(column=3, row=5, sticky='NEWS', columnspan=2)

    # evaluation box buttons
    bg = self._FRAME_PARAMS['background']
    frame2 = Frame(frame0, background=bg)
    frame2.grid(column=3, row=7, sticky='EW')
    # Both checkbuttons redraw the plot; _eval_plot reads the two
    # variables.
    self._autoscale = IntVar(self.top)
    self._autoscale.set(False)
    Checkbutton(
        frame2,
        variable=self._autoscale,
        command=self._eval_plot,
        text='Zoom',
        **self._BUTTON_PARAMS
    ).pack(side='left')
    self._eval_lines = IntVar(self.top)
    self._eval_lines.set(False)
    Checkbutton(
        frame2,
        variable=self._eval_lines,
        command=self._eval_plot,
        text='Lines',
        **self._BUTTON_PARAMS
    ).pack(side='left')
    # NOTE(review): this button is created without a ``command``, so
    # pressing it does nothing here -- confirm whether that is intended.
    Button(frame2, text='History', **self._BUTTON_PARAMS).pack(side='right')

    # The status label
    self.status = Label(frame0, font=self._font, **self._STATUS_PARAMS)
    self.status.grid(column=0, row=9, sticky='NEW', padx=3, pady=2, columnspan=5)

    # Help box & devset box can't be edited.
    self.helpbox['state'] = 'disabled'
    self.devsetbox['state'] = 'disabled'

    # Spacers
    bg = self._FRAME_PARAMS['background']
    Frame(frame0, height=10, width=0, background=bg).grid(column=0, row=3)
    Frame(frame0, height=0, width=10, background=bg).grid(column=2, row=0)
    Frame(frame0, height=6, width=0, background=bg).grid(column=0, row=8)

    # pack the frame.
    frame0.pack(fill='both', expand=True)

    # Set up colors for the devset box
    self.devsetbox.tag_config('true-pos', background='#afa', underline='True')
    self.devsetbox.tag_config('false-neg', underline='True', foreground='#800')
    self.devsetbox.tag_config('false-pos', background='#faa')
    self.devsetbox.tag_config('trace', foreground='#666', wrap='none')
    self.devsetbox.tag_config('wrapindent', lmargin2=30, wrap='none')
    self.devsetbox.tag_config('error', foreground='#800')

    # And for the grammarbox
    self.grammarbox.tag_config('error', background='#fec')
    self.grammarbox.tag_config('comment', foreground='#840')
    self.grammarbox.tag_config('angle', foreground='#00f')
    self.grammarbox.tag_config('brace', foreground='#0a0')
    self.grammarbox.tag_config('hangindent', lmargin1=0, lmargin2=40)
# Class-level default.  ``show_trace`` sets the instance attribute to
# True; ``toggle_show_trace`` reads it to decide which view to show.
_showing_trace = False
def show_trace(self, *e):
    """
    Fill the development-set box with a rule-by-rule trace for the
    current sentence: the tag sequence is printed once for the start
    state and once after each rule, with the chunks found so far
    coloured by whether they match the gold standard.

    :param e: Ignored; accepts the event passed by a key binding.
    """
    self._showing_trace = True
    self.trace_button['state'] = 'disabled'
    self.devset_button['state'] = 'normal'

    self.devsetbox['state'] = 'normal'
    # self.devsetbox['wrap'] = 'none'
    self.devsetbox.delete('1.0', 'end')
    self.devsetlabel['text'] = 'Development Set (%d/%d)' % (
        (self.devset_index + 1, self._devset_size.get())
    )

    if self.chunker is None:
        self.devsetbox.insert('1.0', 'Trace: waiting for a valid grammar.')
        self.devsetbox.tag_add('error', '1.0', 'end')
        return  # can't do anything more

    gold_tree = self.devset[self.devset_index]
    rules = self.chunker.rules()
    step_count = len(rules) + 1

    # Calculate the tag sequence, recording the character offset at
    # which each tag starts.
    tagseq = '\t'
    charnum = [1]
    for _word, pos in gold_tree.leaves():
        tagseq += '%s ' % pos
        charnum.append(len(tagseq))
    # (step, token index) -> character offset, and step -> line number
    # of that step's tag sequence (each step takes two lines).
    self.charnum = {
        (i, j): offset for i in range(step_count) for j, offset in enumerate(charnum)
    }
    self.linenum = {i: i * 2 + 2 for i in range(step_count)}

    full_chunker = self.chunker
    for i in range(step_count):
        if i == 0:
            self.devsetbox.insert('end', 'Start:\n')
            self.devsetbox.tag_add('trace', 'end -2c linestart', 'end -2c')
        else:
            self.devsetbox.insert('end', 'Apply %s:\n' % rules[i - 1])
            self.devsetbox.tag_add('trace', 'end -2c linestart', 'end -2c')
        # Display the tag sequence.
        self.devsetbox.insert('end', tagseq + '\n')
        self.devsetbox.tag_add('wrapindent', 'end -2c linestart', 'end -2c')
        # Run a partial parser, and extract gold & test chunks.
        # The partial parser used to be built and then discarded.
        # _chunkparse is defined outside this section; it takes only
        # the tokens, so it presumably parses with self.chunker.
        # Hence the partial parser is installed there for the call and
        # the full one is always restored afterwards.
        self.chunker = RegexpChunkParser(rules[:i])
        try:
            test_tree = self._chunkparse(gold_tree.leaves())
        finally:
            self.chunker = full_chunker
        gold_chunks = self._chunks(gold_tree)
        test_chunks = self._chunks(test_tree)
        # Compare them.
        for chunk in gold_chunks.intersection(test_chunks):
            self._color_chunk(i, chunk, 'true-pos')
        for chunk in gold_chunks - test_chunks:
            self._color_chunk(i, chunk, 'false-neg')
        for chunk in test_chunks - gold_chunks:
            self._color_chunk(i, chunk, 'false-pos')
    self.devsetbox.insert('end', 'Finished.\n')
    self.devsetbox.tag_add('trace', 'end -2c linestart', 'end -2c')

    # This is a hack, because the x-scrollbar isn't updating its
    # position right -- I'm not sure what the underlying cause is
    # though.  (This is on OS X w/ python 2.5)
    self.top.after(100, self.devset_xscroll.set, 0, 0.3)
def show_help(self, tab):
    """
    Display one help page in the help box and mark its tab as the
    selected one.

    The page text comes from ``self.HELP``.  ``<<TAGSET>>`` is replaced
    by a table built from ``self.tagset``, and each ``<tag>...</tag>``
    pair listed in ``HELP_AUTOTAG`` is hidden and its contents styled
    with the matching ``tag-<name>`` text tag.

    :param tab: The title of the help page to show.
    """

    def tag_sort_key(item):
        # Tags starting with a word character sort before the
        # punctuation tags; within each group, sort by tag.
        tag = item[0]
        return (0 if re.match(r'\w+', tag) else 1, tag)

    self.helpbox['state'] = 'normal'
    self.helpbox.delete('1.0', 'end')
    for (name, tabstops, text) in self.HELP:
        if name != tab:
            self.helptabs[name].config(**self._HELPTAB_BG_PARAMS)
            continue

        tag_table = '\n'.join(
            '\t%s\t%s' % item
            for item in sorted(self.tagset.items(), key=tag_sort_key)
        )
        text = text.replace('<<TAGSET>>', tag_table)

        self.helptabs[name].config(**self._HELPTAB_FG_PARAMS)
        self.helpbox.config(tabs=tabstops)
        self.helpbox.insert('1.0', text + '\n' * 20)
        # Text index of a character offset from the start of the box.
        C = '1.0 + %d chars'
        for (tag, params) in self.HELP_AUTOTAG:
            pattern = '(?s)(<%s>)(.*?)(</%s>)' % (tag, tag)
            for m in re.finditer(pattern, text):
                # Hide the opening and closing markup, style what is
                # between them.
                self.helpbox.tag_add('elide', C % m.start(1), C % m.end(1))
                self.helpbox.tag_add('tag-%s' % tag, C % m.start(2), C % m.end(2))
                self.helpbox.tag_add('elide', C % m.start(3), C % m.end(3))
    self.helpbox['state'] = 'disabled'
def _history_prev(self, *e):
    """
    Show the grammar that precedes the current one in the history.

    :param e: Ignored; accepts an event argument.
    :return: ``'break'``.
    """
    target = self._history_index - 1
    self._view_history(target)
    return 'break'
def _history_next(self, *e):
    """
    Show the grammar that follows the current one in the history.

    :param e: Ignored; accepts an event argument.
    :return: ``'break'``.
    """
    target = self._history_index + 1
    self._view_history(target)
    return 'break'
def _view_history(self, index):
# Bounds & sanity checking:
index = max(0, min(len(self._history) - 1, index))
if not self._history:
return
# Already viewing the requested history item?
if index == self._history_index:
return
# Show the requested grammar. It will get added to _history
# only if they edit it (causing self.update() to get run.)
self.grammarbox['state'] = 'normal'
self.grammarbox.delete('1.0', 'end')
self.grammarbox.insert('end', self._history[index][0])
self.grammarbox.mark_set('insert', '1.0')
self._history_index = index
self._syntax_highlight_grammar(self._history[index][0])
# Record the normalized grammar & regenerate the chunker.
self.normalized_grammar = self.normalize_grammar(self._history[index][0])
if self.normalized_grammar:
rules = [
RegexpChunkRule.fromstring(line)
for line in self.normalized_grammar.split('\n')
]
else:
rules = []
self.chunker = RegexpChunkParser(rules)
# Show the score.
self._eval_plot()
# Update the devset box
self._highlight_devset()
if self._showing_trace:
self.show_trace()
# Update the grammar label
if self._history_index < len(self._history) - 1:
self.grammarlabel['text'] = 'Grammar %s/%s:' % (
self._history_index + 1,
len(self._history),
)
else:
self.grammarlabel['text'] = 'Grammar:'
def _devset_next(self, *e):
self._devset_scroll('scroll', 1, 'page')
return 'break'
def _devset_prev(self, *e):
self._devset_scroll('scroll', -1, 'page')
return 'break'
def destroy(self, *e):
if self.top is None:
return
self.top.destroy()
self.top = None
def _devset_scroll(self, command, *args):
N = 1 # size of a page -- one sentence.
showing_trace = self._showing_trace
if command == 'scroll' and args[1].startswith('unit'):
self.show_devset(self.devset_index + int(args[0]))
elif command == 'scroll' and args[1].startswith('page'):
self.show_devset(self.devset_index + N * int(args[0]))
elif command == 'moveto':
self.show_devset(int(float(args[0]) * self._devset_size.get()))
else:
assert 0, 'bad scroll command %s %s' % (command, args)
if showing_trace:
self.show_trace()
def show_devset(self, index=None):
if index is None:
index = self.devset_index
# Bounds checking
index = min(max(0, index), self._devset_size.get() - 1)
if index == self.devset_index and not self._showing_trace:
return
self.devset_index = index
self._showing_trace = False
self.trace_button['state'] = 'normal'
self.devset_button['state'] = 'disabled'
# Clear the text box.
self.devsetbox['state'] = 'normal'
self.devsetbox['wrap'] = 'word'
self.devsetbox.delete('1.0', 'end')
self.devsetlabel['text'] = 'Development Set (%d/%d)' % (
(self.devset_index + 1, self._devset_size.get())
)
# Add the sentences
sample = self.devset[self.devset_index : self.devset_index + 1]
self.charnum = {}
self.linenum = {0: 1}
for sentnum, sent in enumerate(sample):
linestr = ''
for wordnum, (word, pos) in enumerate(sent.leaves()):
self.charnum[sentnum, wordnum] = len(linestr)
linestr += '%s/%s ' % (word, pos)
self.charnum[sentnum, wordnum + 1] = len(linestr)
self.devsetbox.insert('end', linestr[:-1] + '\n\n')
# Highlight chunks in the dev set
if self.chunker is not None:
self._highlight_devset()
self.devsetbox['state'] = 'disabled'
# Update the scrollbar
first = self.devset_index / self._devset_size.get()
last = (self.devset_index + 2) / self._devset_size.get()
self.devset_scroll.set(first, last)
def _chunks(self, tree):
chunks = set()
wordnum = 0
for child in tree:
if isinstance(child, Tree):
if child.label() == self._chunk_label:
chunks.add((wordnum, wordnum + len(child)))
wordnum += len(child)
else:
wordnum += 1
return chunks
def _syntax_highlight_grammar(self, grammar):
if self.top is None:
return
self.grammarbox.tag_remove('comment', '1.0', 'end')
self.grammarbox.tag_remove('angle', '1.0', 'end')
self.grammarbox.tag_remove('brace', '1.0', 'end')
self.grammarbox.tag_add('hangindent', '1.0', 'end')
for lineno, line in enumerate(grammar.split('\n')):
if not line.strip():
continue
m = re.match(r'(\\.|[^#])*(#.*)?', line)
comment_start = None
if m.group(2):
comment_start = m.start(2)
s = '%d.%d' % (lineno + 1, m.start(2))
e = '%d.%d' % (lineno + 1, m.end(2))
self.grammarbox.tag_add('comment', s, e)
for m in re.finditer('[<>{}]', line):
if comment_start is not None and m.start() >= comment_start:
break
s = '%d.%d' % (lineno + 1, m.start())
e = '%d.%d' % (lineno + 1, m.end())
if m.group() in '<>':
self.grammarbox.tag_add('angle', s, e)
else:
self.grammarbox.tag_add('brace', s, e)
def _grammarcheck(self, grammar):
if self.top is None:
return
self.grammarbox.tag_remove('error', '1.0', 'end')
self._grammarcheck_errs = []
for lineno, line in enumerate(grammar.split('\n')):
line = re.sub(r'((\\.|[^#])*)(#.*)?', r'\1', line)
line = line.strip()
if line:
try:
RegexpChunkRule.fromstring(line)
except ValueError as e:
self.grammarbox.tag_add(
'error', '%s.0' % (lineno + 1), '%s.0 lineend' % (lineno + 1)
)
self.status['text'] = ''
def update(self, *event):
# Record when update was called (for grammarcheck)
if event:
self._last_keypress = time.time()
# Read the grammar from the Text box.
self.grammar = grammar = self.grammarbox.get('1.0', 'end')
# If the grammar hasn't changed, do nothing:
normalized_grammar = self.normalize_grammar(grammar)
if normalized_grammar == self.normalized_grammar:
return
else:
self.normalized_grammar = normalized_grammar
# If the grammar has changed, and we're looking at history,
# then stop looking at history.
if self._history_index < len(self._history) - 1:
self.grammarlabel['text'] = 'Grammar:'
self._syntax_highlight_grammar(grammar)
# The grammar has changed; try parsing it. If it doesn't
# parse, do nothing. (flag error location?)
try:
# Note: the normalized grammar has no blank lines.
if normalized_grammar:
rules = [
RegexpChunkRule.fromstring(line)
for line in normalized_grammar.split('\n')
]
else:
rules = []
except ValueError as e:
# Use the un-normalized grammar for error highlighting.
self._grammarcheck(grammar)
self.chunker = None
return
self.chunker = RegexpChunkParser(rules)
self.grammarbox.tag_remove('error', '1.0', 'end')
self.grammar_changed = time.time()
# Display the results
if self._showing_trace:
self.show_trace()
else:
self._highlight_devset()
# Start the eval demon
if not self._eval_demon_running:
self._eval_demon()
def _highlight_devset(self, sample=None):
if sample is None:
sample = self.devset[self.devset_index : self.devset_index + 1]
self.devsetbox.tag_remove('true-pos', '1.0', 'end')
self.devsetbox.tag_remove('false-neg', '1.0', 'end')
self.devsetbox.tag_remove('false-pos', '1.0', 'end')
# Run the grammar on the test cases.
for sentnum, gold_tree in enumerate(sample):
# Run the chunk parser
test_tree = self._chunkparse(gold_tree.leaves())
# Extract gold & test chunks
gold_chunks = self._chunks(gold_tree)
test_chunks = self._chunks(test_tree)
# Compare them.
for chunk in gold_chunks.intersection(test_chunks):
self._color_chunk(sentnum, chunk, 'true-pos')
for chunk in gold_chunks - test_chunks:
self._color_chunk(sentnum, chunk, 'false-neg')
for chunk in test_chunks - gold_chunks:
self._color_chunk(sentnum, chunk, 'false-pos')
def _chunkparse(self, words):
try:
return self.chunker.parse(words)
except (ValueError, IndexError) as e:
# There's an error somewhere in the grammar, but we're not sure
# exactly where, so just mark the whole grammar as bad.
# E.g., this is caused by: "({<NN>})"
self.grammarbox.tag_add('error', '1.0', 'end')
# Treat it as tagging nothing:
return words
def _color_chunk(self, sentnum, chunk, tag):
start, end = chunk
self.devsetbox.tag_add(
tag,
'%s.%s' % (self.linenum[sentnum], self.charnum[sentnum, start]),
'%s.%s' % (self.linenum[sentnum], self.charnum[sentnum, end] - 1),
)
def reset(self):
# Clear various variables
self.chunker = None
self.grammar = None
self.normalized_grammar = None
self.grammar_changed = 0
self._history = []
self._history_index = 0
# Update the on-screen display.
self.grammarbox.delete('1.0', 'end')
self.show_devset(0)
self.update()
# self._eval_plot()
#: Template of a saved grammar file: a commented header followed by a
#: blank line and the grammar.  Filled in with a mapping that supplies
#: ``date``, ``devset``, ``precision``, ``recall``, ``fscore`` and
#: ``grammar``.
SAVE_GRAMMAR_TEMPLATE = '\n'.join(
    (
        '# Regexp Chunk Parsing Grammar',
        '# Saved %(date)s',
        '#',
        '# Development set: %(devset)s',
        '# Precision: %(precision)s',
        '# Recall: %(recall)s',
        '# F-score: %(fscore)s',
        '',
        '%(grammar)s',
        '',
    )
)
def save_grammar(self, filename=None):
if not filename:
ftypes = [('Chunk Gramamr', '.chunk'), ('All files', '*')]
filename = asksaveasfilename(filetypes=ftypes, defaultextension='.chunk')
if not filename:
return
if self._history and self.normalized_grammar == self.normalize_grammar(
self._history[-1][0]
):
precision, recall, fscore = [
'%.2f%%' % (100 * v) for v in self._history[-1][1:]
]
elif self.chunker is None:
precision = recall = fscore = 'Grammar not well formed'
else:
precision = recall = fscore = 'Not finished evaluation yet'
with open(filename, 'w') as outfile:
outfile.write(
self.SAVE_GRAMMAR_TEMPLATE
% dict(
date=time.ctime(),
devset=self.devset_name,
precision=precision,
recall=recall,
fscore=fscore,
grammar=self.grammar.strip(),
)
)
def load_grammar(self, filename=None):
if not filename:
ftypes = [('Chunk Gramamr', '.chunk'), ('All files', '*')]
filename = askopenfilename(filetypes=ftypes, defaultextension='.chunk')
if not filename:
return
self.grammarbox.delete('1.0', 'end')
self.update()
with open(filename, 'r') as infile:
grammar = infile.read()
grammar = re.sub(
'^\# Regexp Chunk Parsing Grammar[\s\S]*' 'F-score:.*\n', '', grammar
).lstrip()
self.grammarbox.insert('1.0', grammar)
self.update()
def save_history(self, filename=None):
if not filename:
ftypes = [('Chunk Gramamr History', '.txt'), ('All files', '*')]
filename = asksaveasfilename(filetypes=ftypes, defaultextension='.txt')
if not filename:
return
with open(filename, 'w') as outfile:
outfile.write('# Regexp Chunk Parsing Grammar History\n')
outfile.write('# Saved %s\n' % time.ctime())
outfile.write('# Development set: %s\n' % self.devset_name)
for i, (g, p, r, f) in enumerate(self._history):
hdr = (
'Grammar %d/%d (precision=%.2f%%, recall=%.2f%%, '
'fscore=%.2f%%)'
% (i + 1, len(self._history), p * 100, r * 100, f * 100)
)
outfile.write('\n%s\n' % hdr)
outfile.write(''.join(' %s\n' % line for line in g.strip().split()))
if not (
self._history
and self.normalized_grammar
== self.normalize_grammar(self._history[-1][0])
):
if self.chunker is None:
outfile.write('\nCurrent Grammar (not well-formed)\n')
else:
outfile.write('\nCurrent Grammar (not evaluated)\n')
outfile.write(
''.join(' %s\n' % line for line in self.grammar.strip().split())
)
def about(self, *e):
    """
    Show the "about" message for this application.

    A message box is tried first; if importing or showing it fails,
    the text is shown with ``ShowText`` instead.  Any event arguments
    are ignored.
    """
    ABOUT = "NLTK RegExp Chunk Parser Application\n" + "Written by Edward Loper"
    TITLE = 'About: Regular Expression Chunk Parser Application'
    try:
        from six.moves.tkinter_messagebox import Message

        Message(message=ABOUT, title=TITLE).show()
    except Exception:
        # Best-effort fallback.  ``Exception`` rather than a bare
        # ``except`` so that KeyboardInterrupt and SystemExit are not
        # swallowed.
        ShowText(self.top, TITLE, ABOUT)
def set_devset_size(self, size=None):
if size is not None:
self._devset_size.set(size)
self._devset_size.set(min(len(self.devset), self._devset_size.get()))
self.show_devset(1)
self.show_devset(0)
# what about history? Evaluated at diff dev set sizes!
def resize(self, size=None):
if size is not None:
self._size.set(size)
size = self._size.get()
self._font.configure(size=-(abs(size)))
self._smallfont.configure(size=min(-10, -(abs(size)) * 14 // 20))
def mainloop(self, *args, **kwargs):
    """
    Enter the Tkinter mainloop.  This function must be called if
    this demo is created from a non-interactive program (e.g.
    from a script); otherwise, the demo will close as soon as
    the script completes.

    Nothing is done when ``in_idle()`` reports true.  All arguments
    are passed through to the top-level window's ``mainloop``.
    """
    if not in_idle():
        self.top.mainloop(*args, **kwargs)
def app():
    """Create a ``RegexpChunkApp`` and enter its main loop."""
    demo = RegexpChunkApp()
    demo.mainloop()
# Launch the application when this file is executed as a script.
if __name__ == '__main__':
    app()
# Names exported by ``from <module> import *``.
__all__ = ['app']