You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
1504 lines
56 KiB
Python
1504 lines
56 KiB
Python
# Natural Language Toolkit: Regexp Chunk Parser Application
|
|
#
|
|
# Copyright (C) 2001-2020 NLTK Project
|
|
# Author: Edward Loper <edloper@gmail.com>
|
|
# URL: <http://nltk.org/>
|
|
# For license information, see LICENSE.TXT
|
|
|
|
"""
|
|
A graphical tool for exploring the regular expression based chunk
|
|
parser ``nltk.chunk.RegexpChunkParser``.
|
|
"""
|
|
|
|
# Todo: Add a way to select the development set from the menubar. This
|
|
# might just need to be a selection box (conll vs treebank etc) plus
|
|
# configuration parameters to select what's being chunked (eg VP vs NP)
|
|
# and what part of the data is being used as the development set.
|
|
|
|
import time
|
|
import textwrap
|
|
import re
|
|
import random
|
|
|
|
from tkinter import (
|
|
Button,
|
|
Canvas,
|
|
Checkbutton,
|
|
Frame,
|
|
IntVar,
|
|
Label,
|
|
Menu,
|
|
Scrollbar,
|
|
Text,
|
|
Tk,
|
|
)
|
|
from tkinter.filedialog import askopenfilename, asksaveasfilename
|
|
from tkinter.font import Font
|
|
|
|
from nltk.tree import Tree
|
|
from nltk.util import in_idle
|
|
from nltk.draw.util import ShowText
|
|
from nltk.corpus import conll2000, treebank_chunk
|
|
from nltk.chunk import ChunkScore, RegexpChunkParser
|
|
from nltk.chunk.regexp import RegexpChunkRule
|
|
|
|
|
|
class RegexpChunkApp(object):
|
|
"""
|
|
A graphical tool for exploring the regular expression based chunk
|
|
parser ``nltk.chunk.RegexpChunkParser``.
|
|
|
|
See ``HELP`` for instructional text.
|
|
"""
|
|
|
|
##/////////////////////////////////////////////////////////////////
|
|
## Help Text
|
|
##/////////////////////////////////////////////////////////////////
|
|
|
|
#: A dictionary mapping from part of speech tags to descriptions,
|
|
#: which is used in the help text. (This should probably live with
|
|
#: the conll and/or treebank corpus instead.)
|
|
TAGSET = {
|
|
"CC": "Coordinating conjunction",
|
|
"PRP$": "Possessive pronoun",
|
|
"CD": "Cardinal number",
|
|
"RB": "Adverb",
|
|
"DT": "Determiner",
|
|
"RBR": "Adverb, comparative",
|
|
"EX": "Existential there",
|
|
"RBS": "Adverb, superlative",
|
|
"FW": "Foreign word",
|
|
"RP": "Particle",
|
|
"JJ": "Adjective",
|
|
"TO": "to",
|
|
"JJR": "Adjective, comparative",
|
|
"UH": "Interjection",
|
|
"JJS": "Adjective, superlative",
|
|
"VB": "Verb, base form",
|
|
"LS": "List item marker",
|
|
"VBD": "Verb, past tense",
|
|
"MD": "Modal",
|
|
"NNS": "Noun, plural",
|
|
"NN": "Noun, singular or masps",
|
|
"VBN": "Verb, past participle",
|
|
"VBZ": "Verb,3rd ps. sing. present",
|
|
"NNP": "Proper noun, singular",
|
|
"NNPS": "Proper noun plural",
|
|
"WDT": "wh-determiner",
|
|
"PDT": "Predeterminer",
|
|
"WP": "wh-pronoun",
|
|
"POS": "Possessive ending",
|
|
"WP$": "Possessive wh-pronoun",
|
|
"PRP": "Personal pronoun",
|
|
"WRB": "wh-adverb",
|
|
"(": "open parenthesis",
|
|
")": "close parenthesis",
|
|
"``": "open quote",
|
|
",": "comma",
|
|
"''": "close quote",
|
|
".": "period",
|
|
"#": "pound sign (currency marker)",
|
|
"$": "dollar sign (currency marker)",
|
|
"IN": "Preposition/subord. conjunction",
|
|
"SYM": "Symbol (mathematical or scientific)",
|
|
"VBG": "Verb, gerund/present participle",
|
|
"VBP": "Verb, non-3rd ps. sing. present",
|
|
":": "colon",
|
|
}
|
|
|
|
#: Contents for the help box. This is a list of tuples, one for
|
|
#: each help page, where each tuple has four elements:
|
|
#: - A title (displayed as a tab)
|
|
#: - A string description of tabstops (see Tkinter.Text for details)
|
|
#: - The text contents for the help page. You can use expressions
|
|
#: like <red>...</red> to colorize the text; see ``HELP_AUTOTAG``
|
|
#: for a list of tags you can use for colorizing.
|
|
HELP = [
|
|
(
|
|
"Help",
|
|
"20",
|
|
"Welcome to the regular expression chunk-parser grammar editor. "
|
|
"You can use this editor to develop and test chunk parser grammars "
|
|
"based on NLTK's RegexpChunkParser class.\n\n"
|
|
# Help box.
|
|
"Use this box ('Help') to learn more about the editor; click on the "
|
|
"tabs for help on specific topics:"
|
|
"<indent>\n"
|
|
"Rules: grammar rule types\n"
|
|
"Regexps: regular expression syntax\n"
|
|
"Tags: part of speech tags\n</indent>\n"
|
|
# Grammar.
|
|
"Use the upper-left box ('Grammar') to edit your grammar. "
|
|
"Each line of your grammar specifies a single 'rule', "
|
|
"which performs an action such as creating a chunk or merging "
|
|
"two chunks.\n\n"
|
|
# Dev set.
|
|
"The lower-left box ('Development Set') runs your grammar on the "
|
|
"development set, and displays the results. "
|
|
"Your grammar's chunks are <highlight>highlighted</highlight>, and "
|
|
"the correct (gold standard) chunks are "
|
|
"<underline>underlined</underline>. If they "
|
|
"match, they are displayed in <green>green</green>; otherwise, "
|
|
"they are displayed in <red>red</red>. The box displays a single "
|
|
"sentence from the development set at a time; use the scrollbar or "
|
|
"the next/previous buttons view additional sentences.\n\n"
|
|
# Performance
|
|
"The lower-right box ('Evaluation') tracks the performance of "
|
|
"your grammar on the development set. The 'precision' axis "
|
|
"indicates how many of your grammar's chunks are correct; and "
|
|
"the 'recall' axis indicates how many of the gold standard "
|
|
"chunks your system generated. Typically, you should try to "
|
|
"design a grammar that scores high on both metrics. The "
|
|
"exact precision and recall of the current grammar, as well "
|
|
"as their harmonic mean (the 'f-score'), are displayed in "
|
|
"the status bar at the bottom of the window.",
|
|
),
|
|
(
|
|
"Rules",
|
|
"10",
|
|
"<h1>{...regexp...}</h1>"
|
|
"<indent>\nChunk rule: creates new chunks from words matching "
|
|
"regexp.</indent>\n\n"
|
|
"<h1>}...regexp...{</h1>"
|
|
"<indent>\nChink rule: removes words matching regexp from existing "
|
|
"chunks.</indent>\n\n"
|
|
"<h1>...regexp1...}{...regexp2...</h1>"
|
|
"<indent>\nSplit rule: splits chunks that match regexp1 followed by "
|
|
"regexp2 in two.</indent>\n\n"
|
|
"<h1>...regexp...{}...regexp...</h1>"
|
|
"<indent>\nMerge rule: joins consecutive chunks that match regexp1 "
|
|
"and regexp2</indent>\n",
|
|
),
|
|
(
|
|
"Regexps",
|
|
"10 60",
|
|
# "Regular Expression Syntax Summary:\n\n"
|
|
"<h1>Pattern\t\tMatches...</h1>\n"
|
|
"<hangindent>"
|
|
"\t<<var>T</var>>\ta word with tag <var>T</var> "
|
|
"(where <var>T</var> may be a regexp).\n"
|
|
"\t<var>x</var>?\tan optional <var>x</var>\n"
|
|
"\t<var>x</var>+\ta sequence of 1 or more <var>x</var>'s\n"
|
|
"\t<var>x</var>*\ta sequence of 0 or more <var>x</var>'s\n"
|
|
"\t<var>x</var>|<var>y</var>\t<var>x</var> or <var>y</var>\n"
|
|
"\t.\tmatches any character\n"
|
|
"\t(<var>x</var>)\tTreats <var>x</var> as a group\n"
|
|
"\t# <var>x...</var>\tTreats <var>x...</var> "
|
|
"(to the end of the line) as a comment\n"
|
|
"\t\\<var>C</var>\tmatches character <var>C</var> "
|
|
"(useful when <var>C</var> is a special character "
|
|
"like + or #)\n"
|
|
"</hangindent>"
|
|
"\n<h1>Examples:</h1>\n"
|
|
"<hangindent>"
|
|
"\t<regexp><NN></regexp>\n"
|
|
'\t\tMatches <match>"cow/NN"</match>\n'
|
|
'\t\tMatches <match>"green/NN"</match>\n'
|
|
"\t<regexp><VB.*></regexp>\n"
|
|
'\t\tMatches <match>"eating/VBG"</match>\n'
|
|
'\t\tMatches <match>"ate/VBD"</match>\n'
|
|
"\t<regexp><IN><DT><NN></regexp>\n"
|
|
'\t\tMatches <match>"on/IN the/DT car/NN"</match>\n'
|
|
"\t<regexp><RB>?<VBD></regexp>\n"
|
|
'\t\tMatches <match>"ran/VBD"</match>\n'
|
|
'\t\tMatches <match>"slowly/RB ate/VBD"</match>\n'
|
|
"\t<regexp><\#><CD> # This is a comment...</regexp>\n"
|
|
'\t\tMatches <match>"#/# 100/CD"</match>\n'
|
|
"</hangindent>",
|
|
),
|
|
(
|
|
"Tags",
|
|
"10 60",
|
|
"<h1>Part of Speech Tags:</h1>\n"
|
|
+ "<hangindent>"
|
|
+ "<<TAGSET>>"
|
|
+ "</hangindent>\n", # this gets auto-substituted w/ self.TAGSET
|
|
),
|
|
]
|
|
|
|
HELP_AUTOTAG = [
|
|
("red", dict(foreground="#a00")),
|
|
("green", dict(foreground="#080")),
|
|
("highlight", dict(background="#ddd")),
|
|
("underline", dict(underline=True)),
|
|
("h1", dict(underline=True)),
|
|
("indent", dict(lmargin1=20, lmargin2=20)),
|
|
("hangindent", dict(lmargin1=0, lmargin2=60)),
|
|
("var", dict(foreground="#88f")),
|
|
("regexp", dict(foreground="#ba7")),
|
|
("match", dict(foreground="#6a6")),
|
|
]
|
|
|
|
##/////////////////////////////////////////////////////////////////
|
|
## Config Parmeters
|
|
##/////////////////////////////////////////////////////////////////
|
|
|
|
_EVAL_DELAY = 1
|
|
"""If the user has not pressed any key for this amount of time (in
|
|
seconds), and the current grammar has not been evaluated, then
|
|
the eval demon will evaluate it."""
|
|
|
|
_EVAL_CHUNK = 15
|
|
"""The number of sentences that should be evaluated by the eval
|
|
demon each time it runs."""
|
|
_EVAL_FREQ = 0.2
|
|
"""The frequency (in seconds) at which the eval demon is run"""
|
|
_EVAL_DEMON_MIN = 0.02
|
|
"""The minimum amount of time that the eval demon should take each time
|
|
it runs -- if it takes less than this time, _EVAL_CHUNK will be
|
|
modified upwards."""
|
|
_EVAL_DEMON_MAX = 0.04
|
|
"""The maximum amount of time that the eval demon should take each time
|
|
it runs -- if it takes more than this time, _EVAL_CHUNK will be
|
|
modified downwards."""
|
|
|
|
_GRAMMARBOX_PARAMS = dict(
|
|
width=40,
|
|
height=12,
|
|
background="#efe",
|
|
highlightbackground="#efe",
|
|
highlightthickness=1,
|
|
relief="groove",
|
|
border=2,
|
|
wrap="word",
|
|
)
|
|
_HELPBOX_PARAMS = dict(
|
|
width=15,
|
|
height=15,
|
|
background="#efe",
|
|
highlightbackground="#efe",
|
|
foreground="#555",
|
|
highlightthickness=1,
|
|
relief="groove",
|
|
border=2,
|
|
wrap="word",
|
|
)
|
|
_DEVSETBOX_PARAMS = dict(
|
|
width=70,
|
|
height=10,
|
|
background="#eef",
|
|
highlightbackground="#eef",
|
|
highlightthickness=1,
|
|
relief="groove",
|
|
border=2,
|
|
wrap="word",
|
|
tabs=(30,),
|
|
)
|
|
_STATUS_PARAMS = dict(background="#9bb", relief="groove", border=2)
|
|
_FONT_PARAMS = dict(family="helvetica", size=-20)
|
|
_FRAME_PARAMS = dict(background="#777", padx=2, pady=2, border=3)
|
|
_EVALBOX_PARAMS = dict(
|
|
background="#eef",
|
|
highlightbackground="#eef",
|
|
highlightthickness=1,
|
|
relief="groove",
|
|
border=2,
|
|
width=300,
|
|
height=280,
|
|
)
|
|
_BUTTON_PARAMS = dict(
|
|
background="#777", activebackground="#777", highlightbackground="#777"
|
|
)
|
|
_HELPTAB_BG_COLOR = "#aba"
|
|
_HELPTAB_FG_COLOR = "#efe"
|
|
|
|
_HELPTAB_FG_PARAMS = dict(background="#efe")
|
|
_HELPTAB_BG_PARAMS = dict(background="#aba")
|
|
_HELPTAB_SPACER = 6
|
|
|
|
def normalize_grammar(self, grammar):
|
|
# Strip comments
|
|
grammar = re.sub(r"((\\.|[^#])*)(#.*)?", r"\1", grammar)
|
|
# Normalize whitespace
|
|
grammar = re.sub(" +", " ", grammar)
|
|
grammar = re.sub("\n\s+", "\n", grammar)
|
|
grammar = grammar.strip()
|
|
# [xx] Hack: automatically backslash $!
|
|
grammar = re.sub(r"([^\\])\$", r"\1\\$", grammar)
|
|
return grammar
|
|
|
|
def __init__(
|
|
self,
|
|
devset_name="conll2000",
|
|
devset=None,
|
|
grammar="",
|
|
chunk_label="NP",
|
|
tagset=None,
|
|
):
|
|
"""
|
|
:param devset_name: The name of the development set; used for
|
|
display & for save files. If either the name 'treebank'
|
|
or the name 'conll2000' is used, and devset is None, then
|
|
devset will be set automatically.
|
|
:param devset: A list of chunked sentences
|
|
:param grammar: The initial grammar to display.
|
|
:param tagset: Dictionary from tags to string descriptions, used
|
|
for the help page. Defaults to ``self.TAGSET``.
|
|
"""
|
|
self._chunk_label = chunk_label
|
|
|
|
if tagset is None:
|
|
tagset = self.TAGSET
|
|
self.tagset = tagset
|
|
|
|
# Named development sets:
|
|
if devset is None:
|
|
if devset_name == "conll2000":
|
|
devset = conll2000.chunked_sents("train.txt") # [:100]
|
|
elif devset == "treebank":
|
|
devset = treebank_chunk.chunked_sents() # [:100]
|
|
else:
|
|
raise ValueError("Unknown development set %s" % devset_name)
|
|
|
|
self.chunker = None
|
|
"""The chunker built from the grammar string"""
|
|
|
|
self.grammar = grammar
|
|
"""The unparsed grammar string"""
|
|
|
|
self.normalized_grammar = None
|
|
"""A normalized version of ``self.grammar``."""
|
|
|
|
self.grammar_changed = 0
|
|
"""The last time() that the grammar was changed."""
|
|
|
|
self.devset = devset
|
|
"""The development set -- a list of chunked sentences."""
|
|
|
|
self.devset_name = devset_name
|
|
"""The name of the development set (for save files)."""
|
|
|
|
self.devset_index = -1
|
|
"""The index into the development set of the first instance
|
|
that's currently being viewed."""
|
|
|
|
self._last_keypress = 0
|
|
"""The time() when a key was most recently pressed"""
|
|
|
|
self._history = []
|
|
"""A list of (grammar, precision, recall, fscore) tuples for
|
|
grammars that the user has already tried."""
|
|
|
|
self._history_index = 0
|
|
"""When the user is scrolling through previous grammars, this
|
|
is used to keep track of which grammar they're looking at."""
|
|
|
|
self._eval_grammar = None
|
|
"""The grammar that is being currently evaluated by the eval
|
|
demon."""
|
|
|
|
self._eval_normalized_grammar = None
|
|
"""A normalized copy of ``_eval_grammar``."""
|
|
|
|
self._eval_index = 0
|
|
"""The index of the next sentence in the development set that
|
|
should be looked at by the eval demon."""
|
|
|
|
self._eval_score = ChunkScore(chunk_label=chunk_label)
|
|
"""The ``ChunkScore`` object that's used to keep track of the score
|
|
of the current grammar on the development set."""
|
|
|
|
# Set up the main window.
|
|
top = self.top = Tk()
|
|
top.geometry("+50+50")
|
|
top.title("Regexp Chunk Parser App")
|
|
top.bind("<Control-q>", self.destroy)
|
|
|
|
# Varaible that restricts how much of the devset we look at.
|
|
self._devset_size = IntVar(top)
|
|
self._devset_size.set(100)
|
|
|
|
# Set up all the tkinter widgets
|
|
self._init_fonts(top)
|
|
self._init_widgets(top)
|
|
self._init_bindings(top)
|
|
self._init_menubar(top)
|
|
self.grammarbox.focus()
|
|
|
|
# If a grammar was given, then display it.
|
|
if grammar:
|
|
self.grammarbox.insert("end", grammar + "\n")
|
|
self.grammarbox.mark_set("insert", "1.0")
|
|
|
|
# Display the first item in the development set
|
|
self.show_devset(0)
|
|
self.update()
|
|
|
|
def _init_bindings(self, top):
|
|
top.bind("<Control-n>", self._devset_next)
|
|
top.bind("<Control-p>", self._devset_prev)
|
|
top.bind("<Control-t>", self.toggle_show_trace)
|
|
top.bind("<KeyPress>", self.update)
|
|
top.bind("<Control-s>", lambda e: self.save_grammar())
|
|
top.bind("<Control-o>", lambda e: self.load_grammar())
|
|
self.grammarbox.bind("<Control-t>", self.toggle_show_trace)
|
|
self.grammarbox.bind("<Control-n>", self._devset_next)
|
|
self.grammarbox.bind("<Control-p>", self._devset_prev)
|
|
|
|
# Redraw the eval graph when the window size changes
|
|
self.evalbox.bind("<Configure>", self._eval_plot)
|
|
|
|
def _init_fonts(self, top):
|
|
# TWhat's our font size (default=same as sysfont)
|
|
self._size = IntVar(top)
|
|
self._size.set(20)
|
|
self._font = Font(family="helvetica", size=-self._size.get())
|
|
self._smallfont = Font(
|
|
family="helvetica", size=-(int(self._size.get() * 14 // 20))
|
|
)
|
|
|
|
def _init_menubar(self, parent):
|
|
menubar = Menu(parent)
|
|
|
|
filemenu = Menu(menubar, tearoff=0)
|
|
filemenu.add_command(label="Reset Application", underline=0, command=self.reset)
|
|
filemenu.add_command(
|
|
label="Save Current Grammar",
|
|
underline=0,
|
|
accelerator="Ctrl-s",
|
|
command=self.save_grammar,
|
|
)
|
|
filemenu.add_command(
|
|
label="Load Grammar",
|
|
underline=0,
|
|
accelerator="Ctrl-o",
|
|
command=self.load_grammar,
|
|
)
|
|
|
|
filemenu.add_command(
|
|
label="Save Grammar History", underline=13, command=self.save_history
|
|
)
|
|
|
|
filemenu.add_command(
|
|
label="Exit", underline=1, command=self.destroy, accelerator="Ctrl-q"
|
|
)
|
|
menubar.add_cascade(label="File", underline=0, menu=filemenu)
|
|
|
|
viewmenu = Menu(menubar, tearoff=0)
|
|
viewmenu.add_radiobutton(
|
|
label="Tiny",
|
|
variable=self._size,
|
|
underline=0,
|
|
value=10,
|
|
command=self.resize,
|
|
)
|
|
viewmenu.add_radiobutton(
|
|
label="Small",
|
|
variable=self._size,
|
|
underline=0,
|
|
value=16,
|
|
command=self.resize,
|
|
)
|
|
viewmenu.add_radiobutton(
|
|
label="Medium",
|
|
variable=self._size,
|
|
underline=0,
|
|
value=20,
|
|
command=self.resize,
|
|
)
|
|
viewmenu.add_radiobutton(
|
|
label="Large",
|
|
variable=self._size,
|
|
underline=0,
|
|
value=24,
|
|
command=self.resize,
|
|
)
|
|
viewmenu.add_radiobutton(
|
|
label="Huge",
|
|
variable=self._size,
|
|
underline=0,
|
|
value=34,
|
|
command=self.resize,
|
|
)
|
|
menubar.add_cascade(label="View", underline=0, menu=viewmenu)
|
|
|
|
devsetmenu = Menu(menubar, tearoff=0)
|
|
devsetmenu.add_radiobutton(
|
|
label="50 sentences",
|
|
variable=self._devset_size,
|
|
value=50,
|
|
command=self.set_devset_size,
|
|
)
|
|
devsetmenu.add_radiobutton(
|
|
label="100 sentences",
|
|
variable=self._devset_size,
|
|
value=100,
|
|
command=self.set_devset_size,
|
|
)
|
|
devsetmenu.add_radiobutton(
|
|
label="200 sentences",
|
|
variable=self._devset_size,
|
|
value=200,
|
|
command=self.set_devset_size,
|
|
)
|
|
devsetmenu.add_radiobutton(
|
|
label="500 sentences",
|
|
variable=self._devset_size,
|
|
value=500,
|
|
command=self.set_devset_size,
|
|
)
|
|
menubar.add_cascade(label="Development-Set", underline=0, menu=devsetmenu)
|
|
|
|
helpmenu = Menu(menubar, tearoff=0)
|
|
helpmenu.add_command(label="About", underline=0, command=self.about)
|
|
menubar.add_cascade(label="Help", underline=0, menu=helpmenu)
|
|
|
|
parent.config(menu=menubar)
|
|
|
|
def toggle_show_trace(self, *e):
|
|
if self._showing_trace:
|
|
self.show_devset()
|
|
else:
|
|
self.show_trace()
|
|
return "break"
|
|
|
|
_SCALE_N = 5 # center on the last 5 examples.
|
|
_DRAW_LINES = False
|
|
|
|
def _eval_plot(self, *e, **config):
|
|
width = config.get("width", self.evalbox.winfo_width())
|
|
height = config.get("height", self.evalbox.winfo_height())
|
|
|
|
# Clear the canvas
|
|
self.evalbox.delete("all")
|
|
|
|
# Draw the precision & recall labels.
|
|
tag = self.evalbox.create_text(
|
|
10, height // 2 - 10, justify="left", anchor="w", text="Precision"
|
|
)
|
|
left, right = self.evalbox.bbox(tag)[2] + 5, width - 10
|
|
tag = self.evalbox.create_text(
|
|
left + (width - left) // 2,
|
|
height - 10,
|
|
anchor="s",
|
|
text="Recall",
|
|
justify="center",
|
|
)
|
|
top, bot = 10, self.evalbox.bbox(tag)[1] - 10
|
|
|
|
# Draw masks for clipping the plot.
|
|
bg = self._EVALBOX_PARAMS["background"]
|
|
self.evalbox.lower(
|
|
self.evalbox.create_rectangle(0, 0, left - 1, 5000, fill=bg, outline=bg)
|
|
)
|
|
self.evalbox.lower(
|
|
self.evalbox.create_rectangle(0, bot + 1, 5000, 5000, fill=bg, outline=bg)
|
|
)
|
|
|
|
# Calculate the plot's scale.
|
|
if self._autoscale.get() and len(self._history) > 1:
|
|
max_precision = max_recall = 0
|
|
min_precision = min_recall = 1
|
|
for i in range(1, min(len(self._history), self._SCALE_N + 1)):
|
|
grammar, precision, recall, fmeasure = self._history[-i]
|
|
min_precision = min(precision, min_precision)
|
|
min_recall = min(recall, min_recall)
|
|
max_precision = max(precision, max_precision)
|
|
max_recall = max(recall, max_recall)
|
|
# if max_precision-min_precision > max_recall-min_recall:
|
|
# min_recall -= (max_precision-min_precision)/2
|
|
# max_recall += (max_precision-min_precision)/2
|
|
# else:
|
|
# min_precision -= (max_recall-min_recall)/2
|
|
# max_precision += (max_recall-min_recall)/2
|
|
# if min_recall < 0:
|
|
# max_recall -= min_recall
|
|
# min_recall = 0
|
|
# if min_precision < 0:
|
|
# max_precision -= min_precision
|
|
# min_precision = 0
|
|
min_precision = max(min_precision - 0.01, 0)
|
|
min_recall = max(min_recall - 0.01, 0)
|
|
max_precision = min(max_precision + 0.01, 1)
|
|
max_recall = min(max_recall + 0.01, 1)
|
|
else:
|
|
min_precision = min_recall = 0
|
|
max_precision = max_recall = 1
|
|
|
|
# Draw the axis lines & grid lines
|
|
for i in range(11):
|
|
x = left + (right - left) * (
|
|
(i / 10.0 - min_recall) / (max_recall - min_recall)
|
|
)
|
|
y = bot - (bot - top) * (
|
|
(i / 10.0 - min_precision) / (max_precision - min_precision)
|
|
)
|
|
if left < x < right:
|
|
self.evalbox.create_line(x, top, x, bot, fill="#888")
|
|
if top < y < bot:
|
|
self.evalbox.create_line(left, y, right, y, fill="#888")
|
|
self.evalbox.create_line(left, top, left, bot)
|
|
self.evalbox.create_line(left, bot, right, bot)
|
|
|
|
# Display the plot's scale
|
|
self.evalbox.create_text(
|
|
left - 3,
|
|
bot,
|
|
justify="right",
|
|
anchor="se",
|
|
text="%d%%" % (100 * min_precision),
|
|
)
|
|
self.evalbox.create_text(
|
|
left - 3,
|
|
top,
|
|
justify="right",
|
|
anchor="ne",
|
|
text="%d%%" % (100 * max_precision),
|
|
)
|
|
self.evalbox.create_text(
|
|
left,
|
|
bot + 3,
|
|
justify="center",
|
|
anchor="nw",
|
|
text="%d%%" % (100 * min_recall),
|
|
)
|
|
self.evalbox.create_text(
|
|
right,
|
|
bot + 3,
|
|
justify="center",
|
|
anchor="ne",
|
|
text="%d%%" % (100 * max_recall),
|
|
)
|
|
|
|
# Display the scores.
|
|
prev_x = prev_y = None
|
|
for i, (_, precision, recall, fscore) in enumerate(self._history):
|
|
x = left + (right - left) * (
|
|
(recall - min_recall) / (max_recall - min_recall)
|
|
)
|
|
y = bot - (bot - top) * (
|
|
(precision - min_precision) / (max_precision - min_precision)
|
|
)
|
|
if i == self._history_index:
|
|
self.evalbox.create_oval(
|
|
x - 2, y - 2, x + 2, y + 2, fill="#0f0", outline="#000"
|
|
)
|
|
self.status["text"] = (
|
|
"Precision: %.2f%%\t" % (precision * 100)
|
|
+ "Recall: %.2f%%\t" % (recall * 100)
|
|
+ "F-score: %.2f%%" % (fscore * 100)
|
|
)
|
|
else:
|
|
self.evalbox.lower(
|
|
self.evalbox.create_oval(
|
|
x - 2, y - 2, x + 2, y + 2, fill="#afa", outline="#8c8"
|
|
)
|
|
)
|
|
if prev_x is not None and self._eval_lines.get():
|
|
self.evalbox.lower(
|
|
self.evalbox.create_line(prev_x, prev_y, x, y, fill="#8c8")
|
|
)
|
|
prev_x, prev_y = x, y
|
|
|
|
_eval_demon_running = False
|
|
|
|
def _eval_demon(self):
|
|
if self.top is None:
|
|
return
|
|
if self.chunker is None:
|
|
self._eval_demon_running = False
|
|
return
|
|
|
|
# Note our starting time.
|
|
t0 = time.time()
|
|
|
|
# If are still typing, then wait for them to finish.
|
|
if (
|
|
time.time() - self._last_keypress < self._EVAL_DELAY
|
|
and self.normalized_grammar != self._eval_normalized_grammar
|
|
):
|
|
self._eval_demon_running = True
|
|
return self.top.after(int(self._EVAL_FREQ * 1000), self._eval_demon)
|
|
|
|
# If the grammar changed, restart the evaluation.
|
|
if self.normalized_grammar != self._eval_normalized_grammar:
|
|
# Check if we've seen this grammar already. If so, then
|
|
# just use the old evaluation values.
|
|
for (g, p, r, f) in self._history:
|
|
if self.normalized_grammar == self.normalize_grammar(g):
|
|
self._history.append((g, p, r, f))
|
|
self._history_index = len(self._history) - 1
|
|
self._eval_plot()
|
|
self._eval_demon_running = False
|
|
self._eval_normalized_grammar = None
|
|
return
|
|
self._eval_index = 0
|
|
self._eval_score = ChunkScore(chunk_label=self._chunk_label)
|
|
self._eval_grammar = self.grammar
|
|
self._eval_normalized_grammar = self.normalized_grammar
|
|
|
|
# If the grammar is empty, the don't bother evaluating it, or
|
|
# recording it in history -- the score will just be 0.
|
|
if self.normalized_grammar.strip() == "":
|
|
# self._eval_index = self._devset_size.get()
|
|
self._eval_demon_running = False
|
|
return
|
|
|
|
# Score the next set of examples
|
|
for gold in self.devset[
|
|
self._eval_index : min(
|
|
self._eval_index + self._EVAL_CHUNK, self._devset_size.get()
|
|
)
|
|
]:
|
|
guess = self._chunkparse(gold.leaves())
|
|
self._eval_score.score(gold, guess)
|
|
|
|
# update our index in the devset.
|
|
self._eval_index += self._EVAL_CHUNK
|
|
|
|
# Check if we're done
|
|
if self._eval_index >= self._devset_size.get():
|
|
self._history.append(
|
|
(
|
|
self._eval_grammar,
|
|
self._eval_score.precision(),
|
|
self._eval_score.recall(),
|
|
self._eval_score.f_measure(),
|
|
)
|
|
)
|
|
self._history_index = len(self._history) - 1
|
|
self._eval_plot()
|
|
self._eval_demon_running = False
|
|
self._eval_normalized_grammar = None
|
|
else:
|
|
progress = 100 * self._eval_index / self._devset_size.get()
|
|
self.status["text"] = "Evaluating on Development Set (%d%%)" % progress
|
|
self._eval_demon_running = True
|
|
self._adaptively_modify_eval_chunk(time.time() - t0)
|
|
self.top.after(int(self._EVAL_FREQ * 1000), self._eval_demon)
|
|
|
|
def _adaptively_modify_eval_chunk(self, t):
|
|
"""
|
|
Modify _EVAL_CHUNK to try to keep the amount of time that the
|
|
eval demon takes between _EVAL_DEMON_MIN and _EVAL_DEMON_MAX.
|
|
|
|
:param t: The amount of time that the eval demon took.
|
|
"""
|
|
if t > self._EVAL_DEMON_MAX and self._EVAL_CHUNK > 5:
|
|
self._EVAL_CHUNK = min(
|
|
self._EVAL_CHUNK - 1,
|
|
max(
|
|
int(self._EVAL_CHUNK * (self._EVAL_DEMON_MAX / t)),
|
|
self._EVAL_CHUNK - 10,
|
|
),
|
|
)
|
|
elif t < self._EVAL_DEMON_MIN:
|
|
self._EVAL_CHUNK = max(
|
|
self._EVAL_CHUNK + 1,
|
|
min(
|
|
int(self._EVAL_CHUNK * (self._EVAL_DEMON_MIN / t)),
|
|
self._EVAL_CHUNK + 10,
|
|
),
|
|
)
|
|
|
|
def _init_widgets(self, top):
|
|
frame0 = Frame(top, **self._FRAME_PARAMS)
|
|
frame0.grid_columnconfigure(0, weight=4)
|
|
frame0.grid_columnconfigure(3, weight=2)
|
|
frame0.grid_rowconfigure(1, weight=1)
|
|
frame0.grid_rowconfigure(5, weight=1)
|
|
|
|
# The grammar
|
|
self.grammarbox = Text(frame0, font=self._font, **self._GRAMMARBOX_PARAMS)
|
|
self.grammarlabel = Label(
|
|
frame0,
|
|
font=self._font,
|
|
text="Grammar:",
|
|
highlightcolor="black",
|
|
background=self._GRAMMARBOX_PARAMS["background"],
|
|
)
|
|
self.grammarlabel.grid(column=0, row=0, sticky="SW")
|
|
self.grammarbox.grid(column=0, row=1, sticky="NEWS")
|
|
|
|
# Scroll bar for grammar
|
|
grammar_scrollbar = Scrollbar(frame0, command=self.grammarbox.yview)
|
|
grammar_scrollbar.grid(column=1, row=1, sticky="NWS")
|
|
self.grammarbox.config(yscrollcommand=grammar_scrollbar.set)
|
|
|
|
# grammar buttons
|
|
bg = self._FRAME_PARAMS["background"]
|
|
frame3 = Frame(frame0, background=bg)
|
|
frame3.grid(column=0, row=2, sticky="EW")
|
|
Button(
|
|
frame3,
|
|
text="Prev Grammar",
|
|
command=self._history_prev,
|
|
**self._BUTTON_PARAMS
|
|
).pack(side="left")
|
|
Button(
|
|
frame3,
|
|
text="Next Grammar",
|
|
command=self._history_next,
|
|
**self._BUTTON_PARAMS
|
|
).pack(side="left")
|
|
|
|
# Help box
|
|
self.helpbox = Text(frame0, font=self._smallfont, **self._HELPBOX_PARAMS)
|
|
self.helpbox.grid(column=3, row=1, sticky="NEWS")
|
|
self.helptabs = {}
|
|
bg = self._FRAME_PARAMS["background"]
|
|
helptab_frame = Frame(frame0, background=bg)
|
|
helptab_frame.grid(column=3, row=0, sticky="SW")
|
|
for i, (tab, tabstops, text) in enumerate(self.HELP):
|
|
label = Label(helptab_frame, text=tab, font=self._smallfont)
|
|
label.grid(column=i * 2, row=0, sticky="S")
|
|
# help_frame.grid_columnconfigure(i, weight=1)
|
|
# label.pack(side='left')
|
|
label.bind("<ButtonPress>", lambda e, tab=tab: self.show_help(tab))
|
|
self.helptabs[tab] = label
|
|
Frame(
|
|
helptab_frame, height=1, width=self._HELPTAB_SPACER, background=bg
|
|
).grid(column=i * 2 + 1, row=0)
|
|
self.helptabs[self.HELP[0][0]].configure(font=self._font)
|
|
self.helpbox.tag_config("elide", elide=True)
|
|
for (tag, params) in self.HELP_AUTOTAG:
|
|
self.helpbox.tag_config("tag-%s" % tag, **params)
|
|
self.show_help(self.HELP[0][0])
|
|
|
|
# Scroll bar for helpbox
|
|
help_scrollbar = Scrollbar(frame0, command=self.helpbox.yview)
|
|
self.helpbox.config(yscrollcommand=help_scrollbar.set)
|
|
help_scrollbar.grid(column=4, row=1, sticky="NWS")
|
|
|
|
# The dev set
|
|
frame4 = Frame(frame0, background=self._FRAME_PARAMS["background"])
|
|
self.devsetbox = Text(frame4, font=self._font, **self._DEVSETBOX_PARAMS)
|
|
self.devsetbox.pack(expand=True, fill="both")
|
|
self.devsetlabel = Label(
|
|
frame0,
|
|
font=self._font,
|
|
text="Development Set:",
|
|
justify="right",
|
|
background=self._DEVSETBOX_PARAMS["background"],
|
|
)
|
|
self.devsetlabel.grid(column=0, row=4, sticky="SW")
|
|
frame4.grid(column=0, row=5, sticky="NEWS")
|
|
|
|
# dev set scrollbars
|
|
self.devset_scroll = Scrollbar(frame0, command=self._devset_scroll)
|
|
self.devset_scroll.grid(column=1, row=5, sticky="NWS")
|
|
self.devset_xscroll = Scrollbar(
|
|
frame4, command=self.devsetbox.xview, orient="horiz"
|
|
)
|
|
self.devsetbox["xscrollcommand"] = self.devset_xscroll.set
|
|
self.devset_xscroll.pack(side="bottom", fill="x")
|
|
|
|
# dev set buttons
|
|
bg = self._FRAME_PARAMS["background"]
|
|
frame1 = Frame(frame0, background=bg)
|
|
frame1.grid(column=0, row=7, sticky="EW")
|
|
Button(
|
|
frame1,
|
|
text="Prev Example (Ctrl-p)",
|
|
command=self._devset_prev,
|
|
**self._BUTTON_PARAMS
|
|
).pack(side="left")
|
|
Button(
|
|
frame1,
|
|
text="Next Example (Ctrl-n)",
|
|
command=self._devset_next,
|
|
**self._BUTTON_PARAMS
|
|
).pack(side="left")
|
|
self.devset_button = Button(
|
|
frame1,
|
|
text="Show example",
|
|
command=self.show_devset,
|
|
state="disabled",
|
|
**self._BUTTON_PARAMS
|
|
)
|
|
self.devset_button.pack(side="right")
|
|
self.trace_button = Button(
|
|
frame1, text="Show trace", command=self.show_trace, **self._BUTTON_PARAMS
|
|
)
|
|
self.trace_button.pack(side="right")
|
|
|
|
# evaluation box
|
|
self.evalbox = Canvas(frame0, **self._EVALBOX_PARAMS)
|
|
label = Label(
|
|
frame0,
|
|
font=self._font,
|
|
text="Evaluation:",
|
|
justify="right",
|
|
background=self._EVALBOX_PARAMS["background"],
|
|
)
|
|
label.grid(column=3, row=4, sticky="SW")
|
|
self.evalbox.grid(column=3, row=5, sticky="NEWS", columnspan=2)
|
|
|
|
# evaluation box buttons
|
|
bg = self._FRAME_PARAMS["background"]
|
|
frame2 = Frame(frame0, background=bg)
|
|
frame2.grid(column=3, row=7, sticky="EW")
|
|
self._autoscale = IntVar(self.top)
|
|
self._autoscale.set(False)
|
|
Checkbutton(
|
|
frame2,
|
|
variable=self._autoscale,
|
|
command=self._eval_plot,
|
|
text="Zoom",
|
|
**self._BUTTON_PARAMS
|
|
).pack(side="left")
|
|
self._eval_lines = IntVar(self.top)
|
|
self._eval_lines.set(False)
|
|
Checkbutton(
|
|
frame2,
|
|
variable=self._eval_lines,
|
|
command=self._eval_plot,
|
|
text="Lines",
|
|
**self._BUTTON_PARAMS
|
|
).pack(side="left")
|
|
Button(frame2, text="History", **self._BUTTON_PARAMS).pack(side="right")
|
|
|
|
# The status label
|
|
self.status = Label(frame0, font=self._font, **self._STATUS_PARAMS)
|
|
self.status.grid(column=0, row=9, sticky="NEW", padx=3, pady=2, columnspan=5)
|
|
|
|
# Help box & devset box can't be edited.
|
|
self.helpbox["state"] = "disabled"
|
|
self.devsetbox["state"] = "disabled"
|
|
|
|
# Spacers
|
|
bg = self._FRAME_PARAMS["background"]
|
|
Frame(frame0, height=10, width=0, background=bg).grid(column=0, row=3)
|
|
Frame(frame0, height=0, width=10, background=bg).grid(column=2, row=0)
|
|
Frame(frame0, height=6, width=0, background=bg).grid(column=0, row=8)
|
|
|
|
# pack the frame.
|
|
frame0.pack(fill="both", expand=True)
|
|
|
|
# Set up colors for the devset box
|
|
self.devsetbox.tag_config("true-pos", background="#afa", underline="True")
|
|
self.devsetbox.tag_config("false-neg", underline="True", foreground="#800")
|
|
self.devsetbox.tag_config("false-pos", background="#faa")
|
|
self.devsetbox.tag_config("trace", foreground="#666", wrap="none")
|
|
self.devsetbox.tag_config("wrapindent", lmargin2=30, wrap="none")
|
|
self.devsetbox.tag_config("error", foreground="#800")
|
|
|
|
# And for the grammarbox
|
|
self.grammarbox.tag_config("error", background="#fec")
|
|
self.grammarbox.tag_config("comment", foreground="#840")
|
|
self.grammarbox.tag_config("angle", foreground="#00f")
|
|
self.grammarbox.tag_config("brace", foreground="#0a0")
|
|
self.grammarbox.tag_config("hangindent", lmargin1=0, lmargin2=40)
|
|
|
|
_showing_trace = False
|
|
|
|
def show_trace(self, *e):
|
|
self._showing_trace = True
|
|
self.trace_button["state"] = "disabled"
|
|
self.devset_button["state"] = "normal"
|
|
|
|
self.devsetbox["state"] = "normal"
|
|
# self.devsetbox['wrap'] = 'none'
|
|
self.devsetbox.delete("1.0", "end")
|
|
self.devsetlabel["text"] = "Development Set (%d/%d)" % (
|
|
(self.devset_index + 1, self._devset_size.get())
|
|
)
|
|
|
|
if self.chunker is None:
|
|
self.devsetbox.insert("1.0", "Trace: waiting for a valid grammar.")
|
|
self.devsetbox.tag_add("error", "1.0", "end")
|
|
return # can't do anything more
|
|
|
|
gold_tree = self.devset[self.devset_index]
|
|
rules = self.chunker.rules()
|
|
|
|
# Calculate the tag sequence
|
|
tagseq = "\t"
|
|
charnum = [1]
|
|
for wordnum, (word, pos) in enumerate(gold_tree.leaves()):
|
|
tagseq += "%s " % pos
|
|
charnum.append(len(tagseq))
|
|
self.charnum = dict(
|
|
((i, j), charnum[j])
|
|
for i in range(len(rules) + 1)
|
|
for j in range(len(charnum))
|
|
)
|
|
self.linenum = dict((i, i * 2 + 2) for i in range(len(rules) + 1))
|
|
|
|
for i in range(len(rules) + 1):
|
|
if i == 0:
|
|
self.devsetbox.insert("end", "Start:\n")
|
|
self.devsetbox.tag_add("trace", "end -2c linestart", "end -2c")
|
|
else:
|
|
self.devsetbox.insert("end", "Apply %s:\n" % rules[i - 1])
|
|
self.devsetbox.tag_add("trace", "end -2c linestart", "end -2c")
|
|
# Display the tag sequence.
|
|
self.devsetbox.insert("end", tagseq + "\n")
|
|
self.devsetbox.tag_add("wrapindent", "end -2c linestart", "end -2c")
|
|
# Run a partial parser, and extract gold & test chunks
|
|
chunker = RegexpChunkParser(rules[:i])
|
|
test_tree = self._chunkparse(gold_tree.leaves())
|
|
gold_chunks = self._chunks(gold_tree)
|
|
test_chunks = self._chunks(test_tree)
|
|
# Compare them.
|
|
for chunk in gold_chunks.intersection(test_chunks):
|
|
self._color_chunk(i, chunk, "true-pos")
|
|
for chunk in gold_chunks - test_chunks:
|
|
self._color_chunk(i, chunk, "false-neg")
|
|
for chunk in test_chunks - gold_chunks:
|
|
self._color_chunk(i, chunk, "false-pos")
|
|
self.devsetbox.insert("end", "Finished.\n")
|
|
self.devsetbox.tag_add("trace", "end -2c linestart", "end -2c")
|
|
|
|
# This is a hack, because the x-scrollbar isn't updating its
|
|
# position right -- I'm not sure what the underlying cause is
|
|
# though. (This is on OS X w/ python 2.5)
|
|
self.top.after(100, self.devset_xscroll.set, 0, 0.3)
|
|
|
|
def show_help(self, tab):
|
|
self.helpbox["state"] = "normal"
|
|
self.helpbox.delete("1.0", "end")
|
|
for (name, tabstops, text) in self.HELP:
|
|
if name == tab:
|
|
text = text.replace(
|
|
"<<TAGSET>>",
|
|
"\n".join(
|
|
(
|
|
"\t%s\t%s" % item
|
|
for item in sorted(
|
|
list(self.tagset.items()),
|
|
key=lambda t_w: re.match("\w+", t_w[0])
|
|
and (0, t_w[0])
|
|
or (1, t_w[0]),
|
|
)
|
|
)
|
|
),
|
|
)
|
|
|
|
self.helptabs[name].config(**self._HELPTAB_FG_PARAMS)
|
|
self.helpbox.config(tabs=tabstops)
|
|
self.helpbox.insert("1.0", text + "\n" * 20)
|
|
C = "1.0 + %d chars"
|
|
for (tag, params) in self.HELP_AUTOTAG:
|
|
pattern = "(?s)(<%s>)(.*?)(</%s>)" % (tag, tag)
|
|
for m in re.finditer(pattern, text):
|
|
self.helpbox.tag_add("elide", C % m.start(1), C % m.end(1))
|
|
self.helpbox.tag_add(
|
|
"tag-%s" % tag, C % m.start(2), C % m.end(2)
|
|
)
|
|
self.helpbox.tag_add("elide", C % m.start(3), C % m.end(3))
|
|
else:
|
|
self.helptabs[name].config(**self._HELPTAB_BG_PARAMS)
|
|
self.helpbox["state"] = "disabled"
|
|
|
|
def _history_prev(self, *e):
|
|
self._view_history(self._history_index - 1)
|
|
return "break"
|
|
|
|
def _history_next(self, *e):
|
|
self._view_history(self._history_index + 1)
|
|
return "break"
|
|
|
|
def _view_history(self, index):
|
|
# Bounds & sanity checking:
|
|
index = max(0, min(len(self._history) - 1, index))
|
|
if not self._history:
|
|
return
|
|
# Already viewing the requested history item?
|
|
if index == self._history_index:
|
|
return
|
|
# Show the requested grammar. It will get added to _history
|
|
# only if they edit it (causing self.update() to get run.)
|
|
self.grammarbox["state"] = "normal"
|
|
self.grammarbox.delete("1.0", "end")
|
|
self.grammarbox.insert("end", self._history[index][0])
|
|
self.grammarbox.mark_set("insert", "1.0")
|
|
self._history_index = index
|
|
self._syntax_highlight_grammar(self._history[index][0])
|
|
# Record the normalized grammar & regenerate the chunker.
|
|
self.normalized_grammar = self.normalize_grammar(self._history[index][0])
|
|
if self.normalized_grammar:
|
|
rules = [
|
|
RegexpChunkRule.fromstring(line)
|
|
for line in self.normalized_grammar.split("\n")
|
|
]
|
|
else:
|
|
rules = []
|
|
self.chunker = RegexpChunkParser(rules)
|
|
# Show the score.
|
|
self._eval_plot()
|
|
# Update the devset box
|
|
self._highlight_devset()
|
|
if self._showing_trace:
|
|
self.show_trace()
|
|
# Update the grammar label
|
|
if self._history_index < len(self._history) - 1:
|
|
self.grammarlabel["text"] = "Grammar %s/%s:" % (
|
|
self._history_index + 1,
|
|
len(self._history),
|
|
)
|
|
else:
|
|
self.grammarlabel["text"] = "Grammar:"
|
|
|
|
def _devset_next(self, *e):
|
|
self._devset_scroll("scroll", 1, "page")
|
|
return "break"
|
|
|
|
def _devset_prev(self, *e):
|
|
self._devset_scroll("scroll", -1, "page")
|
|
return "break"
|
|
|
|
def destroy(self, *e):
|
|
if self.top is None:
|
|
return
|
|
self.top.destroy()
|
|
self.top = None
|
|
|
|
def _devset_scroll(self, command, *args):
|
|
N = 1 # size of a page -- one sentence.
|
|
showing_trace = self._showing_trace
|
|
if command == "scroll" and args[1].startswith("unit"):
|
|
self.show_devset(self.devset_index + int(args[0]))
|
|
elif command == "scroll" and args[1].startswith("page"):
|
|
self.show_devset(self.devset_index + N * int(args[0]))
|
|
elif command == "moveto":
|
|
self.show_devset(int(float(args[0]) * self._devset_size.get()))
|
|
else:
|
|
assert 0, "bad scroll command %s %s" % (command, args)
|
|
if showing_trace:
|
|
self.show_trace()
|
|
|
|
def show_devset(self, index=None):
|
|
if index is None:
|
|
index = self.devset_index
|
|
|
|
# Bounds checking
|
|
index = min(max(0, index), self._devset_size.get() - 1)
|
|
|
|
if index == self.devset_index and not self._showing_trace:
|
|
return
|
|
self.devset_index = index
|
|
|
|
self._showing_trace = False
|
|
self.trace_button["state"] = "normal"
|
|
self.devset_button["state"] = "disabled"
|
|
|
|
# Clear the text box.
|
|
self.devsetbox["state"] = "normal"
|
|
self.devsetbox["wrap"] = "word"
|
|
self.devsetbox.delete("1.0", "end")
|
|
self.devsetlabel["text"] = "Development Set (%d/%d)" % (
|
|
(self.devset_index + 1, self._devset_size.get())
|
|
)
|
|
|
|
# Add the sentences
|
|
sample = self.devset[self.devset_index : self.devset_index + 1]
|
|
self.charnum = {}
|
|
self.linenum = {0: 1}
|
|
for sentnum, sent in enumerate(sample):
|
|
linestr = ""
|
|
for wordnum, (word, pos) in enumerate(sent.leaves()):
|
|
self.charnum[sentnum, wordnum] = len(linestr)
|
|
linestr += "%s/%s " % (word, pos)
|
|
self.charnum[sentnum, wordnum + 1] = len(linestr)
|
|
self.devsetbox.insert("end", linestr[:-1] + "\n\n")
|
|
|
|
# Highlight chunks in the dev set
|
|
if self.chunker is not None:
|
|
self._highlight_devset()
|
|
self.devsetbox["state"] = "disabled"
|
|
|
|
# Update the scrollbar
|
|
first = self.devset_index / self._devset_size.get()
|
|
last = (self.devset_index + 2) / self._devset_size.get()
|
|
self.devset_scroll.set(first, last)
|
|
|
|
def _chunks(self, tree):
|
|
chunks = set()
|
|
wordnum = 0
|
|
for child in tree:
|
|
if isinstance(child, Tree):
|
|
if child.label() == self._chunk_label:
|
|
chunks.add((wordnum, wordnum + len(child)))
|
|
wordnum += len(child)
|
|
else:
|
|
wordnum += 1
|
|
return chunks
|
|
|
|
def _syntax_highlight_grammar(self, grammar):
|
|
if self.top is None:
|
|
return
|
|
self.grammarbox.tag_remove("comment", "1.0", "end")
|
|
self.grammarbox.tag_remove("angle", "1.0", "end")
|
|
self.grammarbox.tag_remove("brace", "1.0", "end")
|
|
self.grammarbox.tag_add("hangindent", "1.0", "end")
|
|
for lineno, line in enumerate(grammar.split("\n")):
|
|
if not line.strip():
|
|
continue
|
|
m = re.match(r"(\\.|[^#])*(#.*)?", line)
|
|
comment_start = None
|
|
if m.group(2):
|
|
comment_start = m.start(2)
|
|
s = "%d.%d" % (lineno + 1, m.start(2))
|
|
e = "%d.%d" % (lineno + 1, m.end(2))
|
|
self.grammarbox.tag_add("comment", s, e)
|
|
for m in re.finditer("[<>{}]", line):
|
|
if comment_start is not None and m.start() >= comment_start:
|
|
break
|
|
s = "%d.%d" % (lineno + 1, m.start())
|
|
e = "%d.%d" % (lineno + 1, m.end())
|
|
if m.group() in "<>":
|
|
self.grammarbox.tag_add("angle", s, e)
|
|
else:
|
|
self.grammarbox.tag_add("brace", s, e)
|
|
|
|
def _grammarcheck(self, grammar):
|
|
if self.top is None:
|
|
return
|
|
self.grammarbox.tag_remove("error", "1.0", "end")
|
|
self._grammarcheck_errs = []
|
|
for lineno, line in enumerate(grammar.split("\n")):
|
|
line = re.sub(r"((\\.|[^#])*)(#.*)?", r"\1", line)
|
|
line = line.strip()
|
|
if line:
|
|
try:
|
|
RegexpChunkRule.fromstring(line)
|
|
except ValueError as e:
|
|
self.grammarbox.tag_add(
|
|
"error", "%s.0" % (lineno + 1), "%s.0 lineend" % (lineno + 1)
|
|
)
|
|
self.status["text"] = ""
|
|
|
|
def update(self, *event):
|
|
# Record when update was called (for grammarcheck)
|
|
if event:
|
|
self._last_keypress = time.time()
|
|
|
|
# Read the grammar from the Text box.
|
|
self.grammar = grammar = self.grammarbox.get("1.0", "end")
|
|
|
|
# If the grammar hasn't changed, do nothing:
|
|
normalized_grammar = self.normalize_grammar(grammar)
|
|
if normalized_grammar == self.normalized_grammar:
|
|
return
|
|
else:
|
|
self.normalized_grammar = normalized_grammar
|
|
|
|
# If the grammar has changed, and we're looking at history,
|
|
# then stop looking at history.
|
|
if self._history_index < len(self._history) - 1:
|
|
self.grammarlabel["text"] = "Grammar:"
|
|
|
|
self._syntax_highlight_grammar(grammar)
|
|
|
|
# The grammar has changed; try parsing it. If it doesn't
|
|
# parse, do nothing. (flag error location?)
|
|
try:
|
|
# Note: the normalized grammar has no blank lines.
|
|
if normalized_grammar:
|
|
rules = [
|
|
RegexpChunkRule.fromstring(line)
|
|
for line in normalized_grammar.split("\n")
|
|
]
|
|
else:
|
|
rules = []
|
|
except ValueError as e:
|
|
# Use the un-normalized grammar for error highlighting.
|
|
self._grammarcheck(grammar)
|
|
self.chunker = None
|
|
return
|
|
|
|
self.chunker = RegexpChunkParser(rules)
|
|
self.grammarbox.tag_remove("error", "1.0", "end")
|
|
self.grammar_changed = time.time()
|
|
# Display the results
|
|
if self._showing_trace:
|
|
self.show_trace()
|
|
else:
|
|
self._highlight_devset()
|
|
# Start the eval demon
|
|
if not self._eval_demon_running:
|
|
self._eval_demon()
|
|
|
|
def _highlight_devset(self, sample=None):
|
|
if sample is None:
|
|
sample = self.devset[self.devset_index : self.devset_index + 1]
|
|
|
|
self.devsetbox.tag_remove("true-pos", "1.0", "end")
|
|
self.devsetbox.tag_remove("false-neg", "1.0", "end")
|
|
self.devsetbox.tag_remove("false-pos", "1.0", "end")
|
|
|
|
# Run the grammar on the test cases.
|
|
for sentnum, gold_tree in enumerate(sample):
|
|
# Run the chunk parser
|
|
test_tree = self._chunkparse(gold_tree.leaves())
|
|
# Extract gold & test chunks
|
|
gold_chunks = self._chunks(gold_tree)
|
|
test_chunks = self._chunks(test_tree)
|
|
# Compare them.
|
|
for chunk in gold_chunks.intersection(test_chunks):
|
|
self._color_chunk(sentnum, chunk, "true-pos")
|
|
for chunk in gold_chunks - test_chunks:
|
|
self._color_chunk(sentnum, chunk, "false-neg")
|
|
for chunk in test_chunks - gold_chunks:
|
|
self._color_chunk(sentnum, chunk, "false-pos")
|
|
|
|
def _chunkparse(self, words):
|
|
try:
|
|
return self.chunker.parse(words)
|
|
except (ValueError, IndexError) as e:
|
|
# There's an error somewhere in the grammar, but we're not sure
|
|
# exactly where, so just mark the whole grammar as bad.
|
|
# E.g., this is caused by: "({<NN>})"
|
|
self.grammarbox.tag_add("error", "1.0", "end")
|
|
# Treat it as tagging nothing:
|
|
return words
|
|
|
|
def _color_chunk(self, sentnum, chunk, tag):
|
|
start, end = chunk
|
|
self.devsetbox.tag_add(
|
|
tag,
|
|
"%s.%s" % (self.linenum[sentnum], self.charnum[sentnum, start]),
|
|
"%s.%s" % (self.linenum[sentnum], self.charnum[sentnum, end] - 1),
|
|
)
|
|
|
|
def reset(self):
|
|
# Clear various variables
|
|
self.chunker = None
|
|
self.grammar = None
|
|
self.normalized_grammar = None
|
|
self.grammar_changed = 0
|
|
self._history = []
|
|
self._history_index = 0
|
|
# Update the on-screen display.
|
|
self.grammarbox.delete("1.0", "end")
|
|
self.show_devset(0)
|
|
self.update()
|
|
# self._eval_plot()
|
|
|
|
SAVE_GRAMMAR_TEMPLATE = (
|
|
"# Regexp Chunk Parsing Grammar\n"
|
|
"# Saved %(date)s\n"
|
|
"#\n"
|
|
"# Development set: %(devset)s\n"
|
|
"# Precision: %(precision)s\n"
|
|
"# Recall: %(recall)s\n"
|
|
"# F-score: %(fscore)s\n\n"
|
|
"%(grammar)s\n"
|
|
)
|
|
|
|
def save_grammar(self, filename=None):
|
|
if not filename:
|
|
ftypes = [("Chunk Gramamr", ".chunk"), ("All files", "*")]
|
|
filename = asksaveasfilename(filetypes=ftypes, defaultextension=".chunk")
|
|
if not filename:
|
|
return
|
|
if self._history and self.normalized_grammar == self.normalize_grammar(
|
|
self._history[-1][0]
|
|
):
|
|
precision, recall, fscore = [
|
|
"%.2f%%" % (100 * v) for v in self._history[-1][1:]
|
|
]
|
|
elif self.chunker is None:
|
|
precision = recall = fscore = "Grammar not well formed"
|
|
else:
|
|
precision = recall = fscore = "Not finished evaluation yet"
|
|
|
|
with open(filename, "w") as outfile:
|
|
outfile.write(
|
|
self.SAVE_GRAMMAR_TEMPLATE
|
|
% dict(
|
|
date=time.ctime(),
|
|
devset=self.devset_name,
|
|
precision=precision,
|
|
recall=recall,
|
|
fscore=fscore,
|
|
grammar=self.grammar.strip(),
|
|
)
|
|
)
|
|
|
|
def load_grammar(self, filename=None):
|
|
if not filename:
|
|
ftypes = [("Chunk Gramamr", ".chunk"), ("All files", "*")]
|
|
filename = askopenfilename(filetypes=ftypes, defaultextension=".chunk")
|
|
if not filename:
|
|
return
|
|
self.grammarbox.delete("1.0", "end")
|
|
self.update()
|
|
with open(filename, "r") as infile:
|
|
grammar = infile.read()
|
|
grammar = re.sub(
|
|
"^\# Regexp Chunk Parsing Grammar[\s\S]*" "F-score:.*\n", "", grammar
|
|
).lstrip()
|
|
self.grammarbox.insert("1.0", grammar)
|
|
self.update()
|
|
|
|
def save_history(self, filename=None):
|
|
if not filename:
|
|
ftypes = [("Chunk Gramamr History", ".txt"), ("All files", "*")]
|
|
filename = asksaveasfilename(filetypes=ftypes, defaultextension=".txt")
|
|
if not filename:
|
|
return
|
|
|
|
with open(filename, "w") as outfile:
|
|
outfile.write("# Regexp Chunk Parsing Grammar History\n")
|
|
outfile.write("# Saved %s\n" % time.ctime())
|
|
outfile.write("# Development set: %s\n" % self.devset_name)
|
|
for i, (g, p, r, f) in enumerate(self._history):
|
|
hdr = (
|
|
"Grammar %d/%d (precision=%.2f%%, recall=%.2f%%, "
|
|
"fscore=%.2f%%)"
|
|
% (i + 1, len(self._history), p * 100, r * 100, f * 100)
|
|
)
|
|
outfile.write("\n%s\n" % hdr)
|
|
outfile.write("".join(" %s\n" % line for line in g.strip().split()))
|
|
|
|
if not (
|
|
self._history
|
|
and self.normalized_grammar
|
|
== self.normalize_grammar(self._history[-1][0])
|
|
):
|
|
if self.chunker is None:
|
|
outfile.write("\nCurrent Grammar (not well-formed)\n")
|
|
else:
|
|
outfile.write("\nCurrent Grammar (not evaluated)\n")
|
|
outfile.write(
|
|
"".join(" %s\n" % line for line in self.grammar.strip().split())
|
|
)
|
|
|
|
def about(self, *e):
|
|
ABOUT = "NLTK RegExp Chunk Parser Application\n" + "Written by Edward Loper"
|
|
TITLE = "About: Regular Expression Chunk Parser Application"
|
|
try:
|
|
from tkinter.messagebox import Message
|
|
|
|
Message(message=ABOUT, title=TITLE).show()
|
|
except:
|
|
ShowText(self.top, TITLE, ABOUT)
|
|
|
|
def set_devset_size(self, size=None):
|
|
if size is not None:
|
|
self._devset_size.set(size)
|
|
self._devset_size.set(min(len(self.devset), self._devset_size.get()))
|
|
self.show_devset(1)
|
|
self.show_devset(0)
|
|
# what about history? Evaluated at diff dev set sizes!
|
|
|
|
def resize(self, size=None):
|
|
if size is not None:
|
|
self._size.set(size)
|
|
size = self._size.get()
|
|
self._font.configure(size=-(abs(size)))
|
|
self._smallfont.configure(size=min(-10, -(abs(size)) * 14 // 20))
|
|
|
|
def mainloop(self, *args, **kwargs):
|
|
"""
|
|
Enter the Tkinter mainloop. This function must be called if
|
|
this demo is created from a non-interactive program (e.g.
|
|
from a secript); otherwise, the demo will close as soon as
|
|
the script completes.
|
|
"""
|
|
if in_idle():
|
|
return
|
|
self.top.mainloop(*args, **kwargs)
|
|
|
|
|
|
def app():
|
|
RegexpChunkApp().mainloop()
|
|
|
|
|
|
if __name__ == "__main__":
|
|
app()
|
|
|
|
__all__ = ["app"]
|