# Natural Language Toolkit: Regexp Chunk Parser Application # # Copyright (C) 2001-2020 NLTK Project # Author: Edward Loper # URL: # For license information, see LICENSE.TXT """ A graphical tool for exploring the regular expression based chunk parser ``nltk.chunk.RegexpChunkParser``. """ # Todo: Add a way to select the development set from the menubar. This # might just need to be a selection box (conll vs treebank etc) plus # configuration parameters to select what's being chunked (eg VP vs NP) # and what part of the data is being used as the development set. import time import textwrap import re import random from tkinter import ( Button, Canvas, Checkbutton, Frame, IntVar, Label, Menu, Scrollbar, Text, Tk, ) from tkinter.filedialog import askopenfilename, asksaveasfilename from tkinter.font import Font from nltk.tree import Tree from nltk.util import in_idle from nltk.draw.util import ShowText from nltk.corpus import conll2000, treebank_chunk from nltk.chunk import ChunkScore, RegexpChunkParser from nltk.chunk.regexp import RegexpChunkRule class RegexpChunkApp(object): """ A graphical tool for exploring the regular expression based chunk parser ``nltk.chunk.RegexpChunkParser``. See ``HELP`` for instructional text. """ ##///////////////////////////////////////////////////////////////// ## Help Text ##///////////////////////////////////////////////////////////////// #: A dictionary mapping from part of speech tags to descriptions, #: which is used in the help text. (This should probably live with #: the conll and/or treebank corpus instead.) TAGSET = { "CC": "Coordinating conjunction", "PRP$": "Possessive pronoun", "CD": "Cardinal number", "RB": "Adverb", "DT": "Determiner", "RBR": "Adverb, comparative", "EX": "Existential there", "RBS": "Adverb, superlative", "FW": "Foreign word", "RP": "Particle", "JJ": "Adjective", "TO": "to", "JJR": "Adjective, comparative", "UH": "Interjection", "JJS": "Adjective, superlative", "VB": "Verb, base form", "LS": "List item marker", "VBD": "Verb, past tense", "MD": "Modal", "NNS": "Noun, plural", "NN": "Noun, singular or masps", "VBN": "Verb, past participle", "VBZ": "Verb,3rd ps. sing. present", "NNP": "Proper noun, singular", "NNPS": "Proper noun plural", "WDT": "wh-determiner", "PDT": "Predeterminer", "WP": "wh-pronoun", "POS": "Possessive ending", "WP$": "Possessive wh-pronoun", "PRP": "Personal pronoun", "WRB": "wh-adverb", "(": "open parenthesis", ")": "close parenthesis", "``": "open quote", ",": "comma", "''": "close quote", ".": "period", "#": "pound sign (currency marker)", "$": "dollar sign (currency marker)", "IN": "Preposition/subord. conjunction", "SYM": "Symbol (mathematical or scientific)", "VBG": "Verb, gerund/present participle", "VBP": "Verb, non-3rd ps. sing. present", ":": "colon", } #: Contents for the help box. This is a list of tuples, one for #: each help page, where each tuple has four elements: #: - A title (displayed as a tab) #: - A string description of tabstops (see Tkinter.Text for details) #: - The text contents for the help page. You can use expressions #: like ... to colorize the text; see ``HELP_AUTOTAG`` #: for a list of tags you can use for colorizing. HELP = [ ( "Help", "20", "Welcome to the regular expression chunk-parser grammar editor. " "You can use this editor to develop and test chunk parser grammars " "based on NLTK's RegexpChunkParser class.\n\n" # Help box. "Use this box ('Help') to learn more about the editor; click on the " "tabs for help on specific topics:" "\n" "Rules: grammar rule types\n" "Regexps: regular expression syntax\n" "Tags: part of speech tags\n\n" # Grammar. "Use the upper-left box ('Grammar') to edit your grammar. " "Each line of your grammar specifies a single 'rule', " "which performs an action such as creating a chunk or merging " "two chunks.\n\n" # Dev set. "The lower-left box ('Development Set') runs your grammar on the " "development set, and displays the results. " "Your grammar's chunks are highlighted, and " "the correct (gold standard) chunks are " "underlined. If they " "match, they are displayed in green; otherwise, " "they are displayed in red. The box displays a single " "sentence from the development set at a time; use the scrollbar or " "the next/previous buttons view additional sentences.\n\n" # Performance "The lower-right box ('Evaluation') tracks the performance of " "your grammar on the development set. The 'precision' axis " "indicates how many of your grammar's chunks are correct; and " "the 'recall' axis indicates how many of the gold standard " "chunks your system generated. Typically, you should try to " "design a grammar that scores high on both metrics. The " "exact precision and recall of the current grammar, as well " "as their harmonic mean (the 'f-score'), are displayed in " "the status bar at the bottom of the window.", ), ( "Rules", "10", "

{...regexp...}

" "\nChunk rule: creates new chunks from words matching " "regexp.\n\n" "

}...regexp...{

" "\nChink rule: removes words matching regexp from existing " "chunks.\n\n" "

...regexp1...}{...regexp2...

" "\nSplit rule: splits chunks that match regexp1 followed by " "regexp2 in two.\n\n" "

...regexp...{}...regexp...

" "\nMerge rule: joins consecutive chunks that match regexp1 " "and regexp2\n", ), ( "Regexps", "10 60", # "Regular Expression Syntax Summary:\n\n" "

Pattern\t\tMatches...

\n" "" "\t<T>\ta word with tag T " "(where T may be a regexp).\n" "\tx?\tan optional x\n" "\tx+\ta sequence of 1 or more x's\n" "\tx*\ta sequence of 0 or more x's\n" "\tx|y\tx or y\n" "\t.\tmatches any character\n" "\t(x)\tTreats x as a group\n" "\t# x...\tTreats x... " "(to the end of the line) as a comment\n" "\t\\C\tmatches character C " "(useful when C is a special character " "like + or #)\n" "" "\n

Examples:

\n" "" "\t\n" '\t\tMatches "cow/NN"\n' '\t\tMatches "green/NN"\n' "\t\n" '\t\tMatches "eating/VBG"\n' '\t\tMatches "ate/VBD"\n' "\t

\n" '\t\tMatches "on/IN the/DT car/NN"\n' "\t?\n" '\t\tMatches "ran/VBD"\n' '\t\tMatches "slowly/RB ate/VBD"\n' "\t<\#> # This is a comment...\n" '\t\tMatches "#/# 100/CD"\n' "", ), ( "Tags", "10 60", "

Part of Speech Tags:

\n" + "" + "<>" + "\n", # this gets auto-substituted w/ self.TAGSET ), ] HELP_AUTOTAG = [ ("red", dict(foreground="#a00")), ("green", dict(foreground="#080")), ("highlight", dict(background="#ddd")), ("underline", dict(underline=True)), ("h1", dict(underline=True)), ("indent", dict(lmargin1=20, lmargin2=20)), ("hangindent", dict(lmargin1=0, lmargin2=60)), ("var", dict(foreground="#88f")), ("regexp", dict(foreground="#ba7")), ("match", dict(foreground="#6a6")), ] ##///////////////////////////////////////////////////////////////// ## Config Parmeters ##///////////////////////////////////////////////////////////////// _EVAL_DELAY = 1 """If the user has not pressed any key for this amount of time (in seconds), and the current grammar has not been evaluated, then the eval demon will evaluate it.""" _EVAL_CHUNK = 15 """The number of sentences that should be evaluated by the eval demon each time it runs.""" _EVAL_FREQ = 0.2 """The frequency (in seconds) at which the eval demon is run""" _EVAL_DEMON_MIN = 0.02 """The minimum amount of time that the eval demon should take each time it runs -- if it takes less than this time, _EVAL_CHUNK will be modified upwards.""" _EVAL_DEMON_MAX = 0.04 """The maximum amount of time that the eval demon should take each time it runs -- if it takes more than this time, _EVAL_CHUNK will be modified downwards.""" _GRAMMARBOX_PARAMS = dict( width=40, height=12, background="#efe", highlightbackground="#efe", highlightthickness=1, relief="groove", border=2, wrap="word", ) _HELPBOX_PARAMS = dict( width=15, height=15, background="#efe", highlightbackground="#efe", foreground="#555", highlightthickness=1, relief="groove", border=2, wrap="word", ) _DEVSETBOX_PARAMS = dict( width=70, height=10, background="#eef", highlightbackground="#eef", highlightthickness=1, relief="groove", border=2, wrap="word", tabs=(30,), ) _STATUS_PARAMS = dict(background="#9bb", relief="groove", border=2) _FONT_PARAMS = dict(family="helvetica", size=-20) _FRAME_PARAMS = dict(background="#777", padx=2, pady=2, border=3) _EVALBOX_PARAMS = dict( background="#eef", highlightbackground="#eef", highlightthickness=1, relief="groove", border=2, width=300, height=280, ) _BUTTON_PARAMS = dict( background="#777", activebackground="#777", highlightbackground="#777" ) _HELPTAB_BG_COLOR = "#aba" _HELPTAB_FG_COLOR = "#efe" _HELPTAB_FG_PARAMS = dict(background="#efe") _HELPTAB_BG_PARAMS = dict(background="#aba") _HELPTAB_SPACER = 6 def normalize_grammar(self, grammar): # Strip comments grammar = re.sub(r"((\\.|[^#])*)(#.*)?", r"\1", grammar) # Normalize whitespace grammar = re.sub(" +", " ", grammar) grammar = re.sub("\n\s+", "\n", grammar) grammar = grammar.strip() # [xx] Hack: automatically backslash $! grammar = re.sub(r"([^\\])\$", r"\1\\$", grammar) return grammar def __init__( self, devset_name="conll2000", devset=None, grammar="", chunk_label="NP", tagset=None, ): """ :param devset_name: The name of the development set; used for display & for save files. If either the name 'treebank' or the name 'conll2000' is used, and devset is None, then devset will be set automatically. :param devset: A list of chunked sentences :param grammar: The initial grammar to display. :param tagset: Dictionary from tags to string descriptions, used for the help page. Defaults to ``self.TAGSET``. """ self._chunk_label = chunk_label if tagset is None: tagset = self.TAGSET self.tagset = tagset # Named development sets: if devset is None: if devset_name == "conll2000": devset = conll2000.chunked_sents("train.txt") # [:100] elif devset == "treebank": devset = treebank_chunk.chunked_sents() # [:100] else: raise ValueError("Unknown development set %s" % devset_name) self.chunker = None """The chunker built from the grammar string""" self.grammar = grammar """The unparsed grammar string""" self.normalized_grammar = None """A normalized version of ``self.grammar``.""" self.grammar_changed = 0 """The last time() that the grammar was changed.""" self.devset = devset """The development set -- a list of chunked sentences.""" self.devset_name = devset_name """The name of the development set (for save files).""" self.devset_index = -1 """The index into the development set of the first instance that's currently being viewed.""" self._last_keypress = 0 """The time() when a key was most recently pressed""" self._history = [] """A list of (grammar, precision, recall, fscore) tuples for grammars that the user has already tried.""" self._history_index = 0 """When the user is scrolling through previous grammars, this is used to keep track of which grammar they're looking at.""" self._eval_grammar = None """The grammar that is being currently evaluated by the eval demon.""" self._eval_normalized_grammar = None """A normalized copy of ``_eval_grammar``.""" self._eval_index = 0 """The index of the next sentence in the development set that should be looked at by the eval demon.""" self._eval_score = ChunkScore(chunk_label=chunk_label) """The ``ChunkScore`` object that's used to keep track of the score of the current grammar on the development set.""" # Set up the main window. top = self.top = Tk() top.geometry("+50+50") top.title("Regexp Chunk Parser App") top.bind("", self.destroy) # Varaible that restricts how much of the devset we look at. self._devset_size = IntVar(top) self._devset_size.set(100) # Set up all the tkinter widgets self._init_fonts(top) self._init_widgets(top) self._init_bindings(top) self._init_menubar(top) self.grammarbox.focus() # If a grammar was given, then display it. if grammar: self.grammarbox.insert("end", grammar + "\n") self.grammarbox.mark_set("insert", "1.0") # Display the first item in the development set self.show_devset(0) self.update() def _init_bindings(self, top): top.bind("", self._devset_next) top.bind("", self._devset_prev) top.bind("", self.toggle_show_trace) top.bind("", self.update) top.bind("", lambda e: self.save_grammar()) top.bind("", lambda e: self.load_grammar()) self.grammarbox.bind("", self.toggle_show_trace) self.grammarbox.bind("", self._devset_next) self.grammarbox.bind("", self._devset_prev) # Redraw the eval graph when the window size changes self.evalbox.bind("", self._eval_plot) def _init_fonts(self, top): # TWhat's our font size (default=same as sysfont) self._size = IntVar(top) self._size.set(20) self._font = Font(family="helvetica", size=-self._size.get()) self._smallfont = Font( family="helvetica", size=-(int(self._size.get() * 14 // 20)) ) def _init_menubar(self, parent): menubar = Menu(parent) filemenu = Menu(menubar, tearoff=0) filemenu.add_command(label="Reset Application", underline=0, command=self.reset) filemenu.add_command( label="Save Current Grammar", underline=0, accelerator="Ctrl-s", command=self.save_grammar, ) filemenu.add_command( label="Load Grammar", underline=0, accelerator="Ctrl-o", command=self.load_grammar, ) filemenu.add_command( label="Save Grammar History", underline=13, command=self.save_history ) filemenu.add_command( label="Exit", underline=1, command=self.destroy, accelerator="Ctrl-q" ) menubar.add_cascade(label="File", underline=0, menu=filemenu) viewmenu = Menu(menubar, tearoff=0) viewmenu.add_radiobutton( label="Tiny", variable=self._size, underline=0, value=10, command=self.resize, ) viewmenu.add_radiobutton( label="Small", variable=self._size, underline=0, value=16, command=self.resize, ) viewmenu.add_radiobutton( label="Medium", variable=self._size, underline=0, value=20, command=self.resize, ) viewmenu.add_radiobutton( label="Large", variable=self._size, underline=0, value=24, command=self.resize, ) viewmenu.add_radiobutton( label="Huge", variable=self._size, underline=0, value=34, command=self.resize, ) menubar.add_cascade(label="View", underline=0, menu=viewmenu) devsetmenu = Menu(menubar, tearoff=0) devsetmenu.add_radiobutton( label="50 sentences", variable=self._devset_size, value=50, command=self.set_devset_size, ) devsetmenu.add_radiobutton( label="100 sentences", variable=self._devset_size, value=100, command=self.set_devset_size, ) devsetmenu.add_radiobutton( label="200 sentences", variable=self._devset_size, value=200, command=self.set_devset_size, ) devsetmenu.add_radiobutton( label="500 sentences", variable=self._devset_size, value=500, command=self.set_devset_size, ) menubar.add_cascade(label="Development-Set", underline=0, menu=devsetmenu) helpmenu = Menu(menubar, tearoff=0) helpmenu.add_command(label="About", underline=0, command=self.about) menubar.add_cascade(label="Help", underline=0, menu=helpmenu) parent.config(menu=menubar) def toggle_show_trace(self, *e): if self._showing_trace: self.show_devset() else: self.show_trace() return "break" _SCALE_N = 5 # center on the last 5 examples. _DRAW_LINES = False def _eval_plot(self, *e, **config): width = config.get("width", self.evalbox.winfo_width()) height = config.get("height", self.evalbox.winfo_height()) # Clear the canvas self.evalbox.delete("all") # Draw the precision & recall labels. tag = self.evalbox.create_text( 10, height // 2 - 10, justify="left", anchor="w", text="Precision" ) left, right = self.evalbox.bbox(tag)[2] + 5, width - 10 tag = self.evalbox.create_text( left + (width - left) // 2, height - 10, anchor="s", text="Recall", justify="center", ) top, bot = 10, self.evalbox.bbox(tag)[1] - 10 # Draw masks for clipping the plot. bg = self._EVALBOX_PARAMS["background"] self.evalbox.lower( self.evalbox.create_rectangle(0, 0, left - 1, 5000, fill=bg, outline=bg) ) self.evalbox.lower( self.evalbox.create_rectangle(0, bot + 1, 5000, 5000, fill=bg, outline=bg) ) # Calculate the plot's scale. if self._autoscale.get() and len(self._history) > 1: max_precision = max_recall = 0 min_precision = min_recall = 1 for i in range(1, min(len(self._history), self._SCALE_N + 1)): grammar, precision, recall, fmeasure = self._history[-i] min_precision = min(precision, min_precision) min_recall = min(recall, min_recall) max_precision = max(precision, max_precision) max_recall = max(recall, max_recall) # if max_precision-min_precision > max_recall-min_recall: # min_recall -= (max_precision-min_precision)/2 # max_recall += (max_precision-min_precision)/2 # else: # min_precision -= (max_recall-min_recall)/2 # max_precision += (max_recall-min_recall)/2 # if min_recall < 0: # max_recall -= min_recall # min_recall = 0 # if min_precision < 0: # max_precision -= min_precision # min_precision = 0 min_precision = max(min_precision - 0.01, 0) min_recall = max(min_recall - 0.01, 0) max_precision = min(max_precision + 0.01, 1) max_recall = min(max_recall + 0.01, 1) else: min_precision = min_recall = 0 max_precision = max_recall = 1 # Draw the axis lines & grid lines for i in range(11): x = left + (right - left) * ( (i / 10.0 - min_recall) / (max_recall - min_recall) ) y = bot - (bot - top) * ( (i / 10.0 - min_precision) / (max_precision - min_precision) ) if left < x < right: self.evalbox.create_line(x, top, x, bot, fill="#888") if top < y < bot: self.evalbox.create_line(left, y, right, y, fill="#888") self.evalbox.create_line(left, top, left, bot) self.evalbox.create_line(left, bot, right, bot) # Display the plot's scale self.evalbox.create_text( left - 3, bot, justify="right", anchor="se", text="%d%%" % (100 * min_precision), ) self.evalbox.create_text( left - 3, top, justify="right", anchor="ne", text="%d%%" % (100 * max_precision), ) self.evalbox.create_text( left, bot + 3, justify="center", anchor="nw", text="%d%%" % (100 * min_recall), ) self.evalbox.create_text( right, bot + 3, justify="center", anchor="ne", text="%d%%" % (100 * max_recall), ) # Display the scores. prev_x = prev_y = None for i, (_, precision, recall, fscore) in enumerate(self._history): x = left + (right - left) * ( (recall - min_recall) / (max_recall - min_recall) ) y = bot - (bot - top) * ( (precision - min_precision) / (max_precision - min_precision) ) if i == self._history_index: self.evalbox.create_oval( x - 2, y - 2, x + 2, y + 2, fill="#0f0", outline="#000" ) self.status["text"] = ( "Precision: %.2f%%\t" % (precision * 100) + "Recall: %.2f%%\t" % (recall * 100) + "F-score: %.2f%%" % (fscore * 100) ) else: self.evalbox.lower( self.evalbox.create_oval( x - 2, y - 2, x + 2, y + 2, fill="#afa", outline="#8c8" ) ) if prev_x is not None and self._eval_lines.get(): self.evalbox.lower( self.evalbox.create_line(prev_x, prev_y, x, y, fill="#8c8") ) prev_x, prev_y = x, y _eval_demon_running = False def _eval_demon(self): if self.top is None: return if self.chunker is None: self._eval_demon_running = False return # Note our starting time. t0 = time.time() # If are still typing, then wait for them to finish. if ( time.time() - self._last_keypress < self._EVAL_DELAY and self.normalized_grammar != self._eval_normalized_grammar ): self._eval_demon_running = True return self.top.after(int(self._EVAL_FREQ * 1000), self._eval_demon) # If the grammar changed, restart the evaluation. if self.normalized_grammar != self._eval_normalized_grammar: # Check if we've seen this grammar already. If so, then # just use the old evaluation values. for (g, p, r, f) in self._history: if self.normalized_grammar == self.normalize_grammar(g): self._history.append((g, p, r, f)) self._history_index = len(self._history) - 1 self._eval_plot() self._eval_demon_running = False self._eval_normalized_grammar = None return self._eval_index = 0 self._eval_score = ChunkScore(chunk_label=self._chunk_label) self._eval_grammar = self.grammar self._eval_normalized_grammar = self.normalized_grammar # If the grammar is empty, the don't bother evaluating it, or # recording it in history -- the score will just be 0. if self.normalized_grammar.strip() == "": # self._eval_index = self._devset_size.get() self._eval_demon_running = False return # Score the next set of examples for gold in self.devset[ self._eval_index : min( self._eval_index + self._EVAL_CHUNK, self._devset_size.get() ) ]: guess = self._chunkparse(gold.leaves()) self._eval_score.score(gold, guess) # update our index in the devset. self._eval_index += self._EVAL_CHUNK # Check if we're done if self._eval_index >= self._devset_size.get(): self._history.append( ( self._eval_grammar, self._eval_score.precision(), self._eval_score.recall(), self._eval_score.f_measure(), ) ) self._history_index = len(self._history) - 1 self._eval_plot() self._eval_demon_running = False self._eval_normalized_grammar = None else: progress = 100 * self._eval_index / self._devset_size.get() self.status["text"] = "Evaluating on Development Set (%d%%)" % progress self._eval_demon_running = True self._adaptively_modify_eval_chunk(time.time() - t0) self.top.after(int(self._EVAL_FREQ * 1000), self._eval_demon) def _adaptively_modify_eval_chunk(self, t): """ Modify _EVAL_CHUNK to try to keep the amount of time that the eval demon takes between _EVAL_DEMON_MIN and _EVAL_DEMON_MAX. :param t: The amount of time that the eval demon took. """ if t > self._EVAL_DEMON_MAX and self._EVAL_CHUNK > 5: self._EVAL_CHUNK = min( self._EVAL_CHUNK - 1, max( int(self._EVAL_CHUNK * (self._EVAL_DEMON_MAX / t)), self._EVAL_CHUNK - 10, ), ) elif t < self._EVAL_DEMON_MIN: self._EVAL_CHUNK = max( self._EVAL_CHUNK + 1, min( int(self._EVAL_CHUNK * (self._EVAL_DEMON_MIN / t)), self._EVAL_CHUNK + 10, ), ) def _init_widgets(self, top): frame0 = Frame(top, **self._FRAME_PARAMS) frame0.grid_columnconfigure(0, weight=4) frame0.grid_columnconfigure(3, weight=2) frame0.grid_rowconfigure(1, weight=1) frame0.grid_rowconfigure(5, weight=1) # The grammar self.grammarbox = Text(frame0, font=self._font, **self._GRAMMARBOX_PARAMS) self.grammarlabel = Label( frame0, font=self._font, text="Grammar:", highlightcolor="black", background=self._GRAMMARBOX_PARAMS["background"], ) self.grammarlabel.grid(column=0, row=0, sticky="SW") self.grammarbox.grid(column=0, row=1, sticky="NEWS") # Scroll bar for grammar grammar_scrollbar = Scrollbar(frame0, command=self.grammarbox.yview) grammar_scrollbar.grid(column=1, row=1, sticky="NWS") self.grammarbox.config(yscrollcommand=grammar_scrollbar.set) # grammar buttons bg = self._FRAME_PARAMS["background"] frame3 = Frame(frame0, background=bg) frame3.grid(column=0, row=2, sticky="EW") Button( frame3, text="Prev Grammar", command=self._history_prev, **self._BUTTON_PARAMS ).pack(side="left") Button( frame3, text="Next Grammar", command=self._history_next, **self._BUTTON_PARAMS ).pack(side="left") # Help box self.helpbox = Text(frame0, font=self._smallfont, **self._HELPBOX_PARAMS) self.helpbox.grid(column=3, row=1, sticky="NEWS") self.helptabs = {} bg = self._FRAME_PARAMS["background"] helptab_frame = Frame(frame0, background=bg) helptab_frame.grid(column=3, row=0, sticky="SW") for i, (tab, tabstops, text) in enumerate(self.HELP): label = Label(helptab_frame, text=tab, font=self._smallfont) label.grid(column=i * 2, row=0, sticky="S") # help_frame.grid_columnconfigure(i, weight=1) # label.pack(side='left') label.bind("", lambda e, tab=tab: self.show_help(tab)) self.helptabs[tab] = label Frame( helptab_frame, height=1, width=self._HELPTAB_SPACER, background=bg ).grid(column=i * 2 + 1, row=0) self.helptabs[self.HELP[0][0]].configure(font=self._font) self.helpbox.tag_config("elide", elide=True) for (tag, params) in self.HELP_AUTOTAG: self.helpbox.tag_config("tag-%s" % tag, **params) self.show_help(self.HELP[0][0]) # Scroll bar for helpbox help_scrollbar = Scrollbar(frame0, command=self.helpbox.yview) self.helpbox.config(yscrollcommand=help_scrollbar.set) help_scrollbar.grid(column=4, row=1, sticky="NWS") # The dev set frame4 = Frame(frame0, background=self._FRAME_PARAMS["background"]) self.devsetbox = Text(frame4, font=self._font, **self._DEVSETBOX_PARAMS) self.devsetbox.pack(expand=True, fill="both") self.devsetlabel = Label( frame0, font=self._font, text="Development Set:", justify="right", background=self._DEVSETBOX_PARAMS["background"], ) self.devsetlabel.grid(column=0, row=4, sticky="SW") frame4.grid(column=0, row=5, sticky="NEWS") # dev set scrollbars self.devset_scroll = Scrollbar(frame0, command=self._devset_scroll) self.devset_scroll.grid(column=1, row=5, sticky="NWS") self.devset_xscroll = Scrollbar( frame4, command=self.devsetbox.xview, orient="horiz" ) self.devsetbox["xscrollcommand"] = self.devset_xscroll.set self.devset_xscroll.pack(side="bottom", fill="x") # dev set buttons bg = self._FRAME_PARAMS["background"] frame1 = Frame(frame0, background=bg) frame1.grid(column=0, row=7, sticky="EW") Button( frame1, text="Prev Example (Ctrl-p)", command=self._devset_prev, **self._BUTTON_PARAMS ).pack(side="left") Button( frame1, text="Next Example (Ctrl-n)", command=self._devset_next, **self._BUTTON_PARAMS ).pack(side="left") self.devset_button = Button( frame1, text="Show example", command=self.show_devset, state="disabled", **self._BUTTON_PARAMS ) self.devset_button.pack(side="right") self.trace_button = Button( frame1, text="Show trace", command=self.show_trace, **self._BUTTON_PARAMS ) self.trace_button.pack(side="right") # evaluation box self.evalbox = Canvas(frame0, **self._EVALBOX_PARAMS) label = Label( frame0, font=self._font, text="Evaluation:", justify="right", background=self._EVALBOX_PARAMS["background"], ) label.grid(column=3, row=4, sticky="SW") self.evalbox.grid(column=3, row=5, sticky="NEWS", columnspan=2) # evaluation box buttons bg = self._FRAME_PARAMS["background"] frame2 = Frame(frame0, background=bg) frame2.grid(column=3, row=7, sticky="EW") self._autoscale = IntVar(self.top) self._autoscale.set(False) Checkbutton( frame2, variable=self._autoscale, command=self._eval_plot, text="Zoom", **self._BUTTON_PARAMS ).pack(side="left") self._eval_lines = IntVar(self.top) self._eval_lines.set(False) Checkbutton( frame2, variable=self._eval_lines, command=self._eval_plot, text="Lines", **self._BUTTON_PARAMS ).pack(side="left") Button(frame2, text="History", **self._BUTTON_PARAMS).pack(side="right") # The status label self.status = Label(frame0, font=self._font, **self._STATUS_PARAMS) self.status.grid(column=0, row=9, sticky="NEW", padx=3, pady=2, columnspan=5) # Help box & devset box can't be edited. self.helpbox["state"] = "disabled" self.devsetbox["state"] = "disabled" # Spacers bg = self._FRAME_PARAMS["background"] Frame(frame0, height=10, width=0, background=bg).grid(column=0, row=3) Frame(frame0, height=0, width=10, background=bg).grid(column=2, row=0) Frame(frame0, height=6, width=0, background=bg).grid(column=0, row=8) # pack the frame. frame0.pack(fill="both", expand=True) # Set up colors for the devset box self.devsetbox.tag_config("true-pos", background="#afa", underline="True") self.devsetbox.tag_config("false-neg", underline="True", foreground="#800") self.devsetbox.tag_config("false-pos", background="#faa") self.devsetbox.tag_config("trace", foreground="#666", wrap="none") self.devsetbox.tag_config("wrapindent", lmargin2=30, wrap="none") self.devsetbox.tag_config("error", foreground="#800") # And for the grammarbox self.grammarbox.tag_config("error", background="#fec") self.grammarbox.tag_config("comment", foreground="#840") self.grammarbox.tag_config("angle", foreground="#00f") self.grammarbox.tag_config("brace", foreground="#0a0") self.grammarbox.tag_config("hangindent", lmargin1=0, lmargin2=40) _showing_trace = False def show_trace(self, *e): self._showing_trace = True self.trace_button["state"] = "disabled" self.devset_button["state"] = "normal" self.devsetbox["state"] = "normal" # self.devsetbox['wrap'] = 'none' self.devsetbox.delete("1.0", "end") self.devsetlabel["text"] = "Development Set (%d/%d)" % ( (self.devset_index + 1, self._devset_size.get()) ) if self.chunker is None: self.devsetbox.insert("1.0", "Trace: waiting for a valid grammar.") self.devsetbox.tag_add("error", "1.0", "end") return # can't do anything more gold_tree = self.devset[self.devset_index] rules = self.chunker.rules() # Calculate the tag sequence tagseq = "\t" charnum = [1] for wordnum, (word, pos) in enumerate(gold_tree.leaves()): tagseq += "%s " % pos charnum.append(len(tagseq)) self.charnum = dict( ((i, j), charnum[j]) for i in range(len(rules) + 1) for j in range(len(charnum)) ) self.linenum = dict((i, i * 2 + 2) for i in range(len(rules) + 1)) for i in range(len(rules) + 1): if i == 0: self.devsetbox.insert("end", "Start:\n") self.devsetbox.tag_add("trace", "end -2c linestart", "end -2c") else: self.devsetbox.insert("end", "Apply %s:\n" % rules[i - 1]) self.devsetbox.tag_add("trace", "end -2c linestart", "end -2c") # Display the tag sequence. self.devsetbox.insert("end", tagseq + "\n") self.devsetbox.tag_add("wrapindent", "end -2c linestart", "end -2c") # Run a partial parser, and extract gold & test chunks chunker = RegexpChunkParser(rules[:i]) test_tree = self._chunkparse(gold_tree.leaves()) gold_chunks = self._chunks(gold_tree) test_chunks = self._chunks(test_tree) # Compare them. for chunk in gold_chunks.intersection(test_chunks): self._color_chunk(i, chunk, "true-pos") for chunk in gold_chunks - test_chunks: self._color_chunk(i, chunk, "false-neg") for chunk in test_chunks - gold_chunks: self._color_chunk(i, chunk, "false-pos") self.devsetbox.insert("end", "Finished.\n") self.devsetbox.tag_add("trace", "end -2c linestart", "end -2c") # This is a hack, because the x-scrollbar isn't updating its # position right -- I'm not sure what the underlying cause is # though. (This is on OS X w/ python 2.5) self.top.after(100, self.devset_xscroll.set, 0, 0.3) def show_help(self, tab): self.helpbox["state"] = "normal" self.helpbox.delete("1.0", "end") for (name, tabstops, text) in self.HELP: if name == tab: text = text.replace( "<>", "\n".join( ( "\t%s\t%s" % item for item in sorted( list(self.tagset.items()), key=lambda t_w: re.match("\w+", t_w[0]) and (0, t_w[0]) or (1, t_w[0]), ) ) ), ) self.helptabs[name].config(**self._HELPTAB_FG_PARAMS) self.helpbox.config(tabs=tabstops) self.helpbox.insert("1.0", text + "\n" * 20) C = "1.0 + %d chars" for (tag, params) in self.HELP_AUTOTAG: pattern = "(?s)(<%s>)(.*?)()" % (tag, tag) for m in re.finditer(pattern, text): self.helpbox.tag_add("elide", C % m.start(1), C % m.end(1)) self.helpbox.tag_add( "tag-%s" % tag, C % m.start(2), C % m.end(2) ) self.helpbox.tag_add("elide", C % m.start(3), C % m.end(3)) else: self.helptabs[name].config(**self._HELPTAB_BG_PARAMS) self.helpbox["state"] = "disabled" def _history_prev(self, *e): self._view_history(self._history_index - 1) return "break" def _history_next(self, *e): self._view_history(self._history_index + 1) return "break" def _view_history(self, index): # Bounds & sanity checking: index = max(0, min(len(self._history) - 1, index)) if not self._history: return # Already viewing the requested history item? if index == self._history_index: return # Show the requested grammar. It will get added to _history # only if they edit it (causing self.update() to get run.) self.grammarbox["state"] = "normal" self.grammarbox.delete("1.0", "end") self.grammarbox.insert("end", self._history[index][0]) self.grammarbox.mark_set("insert", "1.0") self._history_index = index self._syntax_highlight_grammar(self._history[index][0]) # Record the normalized grammar & regenerate the chunker. self.normalized_grammar = self.normalize_grammar(self._history[index][0]) if self.normalized_grammar: rules = [ RegexpChunkRule.fromstring(line) for line in self.normalized_grammar.split("\n") ] else: rules = [] self.chunker = RegexpChunkParser(rules) # Show the score. self._eval_plot() # Update the devset box self._highlight_devset() if self._showing_trace: self.show_trace() # Update the grammar label if self._history_index < len(self._history) - 1: self.grammarlabel["text"] = "Grammar %s/%s:" % ( self._history_index + 1, len(self._history), ) else: self.grammarlabel["text"] = "Grammar:" def _devset_next(self, *e): self._devset_scroll("scroll", 1, "page") return "break" def _devset_prev(self, *e): self._devset_scroll("scroll", -1, "page") return "break" def destroy(self, *e): if self.top is None: return self.top.destroy() self.top = None def _devset_scroll(self, command, *args): N = 1 # size of a page -- one sentence. showing_trace = self._showing_trace if command == "scroll" and args[1].startswith("unit"): self.show_devset(self.devset_index + int(args[0])) elif command == "scroll" and args[1].startswith("page"): self.show_devset(self.devset_index + N * int(args[0])) elif command == "moveto": self.show_devset(int(float(args[0]) * self._devset_size.get())) else: assert 0, "bad scroll command %s %s" % (command, args) if showing_trace: self.show_trace() def show_devset(self, index=None): if index is None: index = self.devset_index # Bounds checking index = min(max(0, index), self._devset_size.get() - 1) if index == self.devset_index and not self._showing_trace: return self.devset_index = index self._showing_trace = False self.trace_button["state"] = "normal" self.devset_button["state"] = "disabled" # Clear the text box. self.devsetbox["state"] = "normal" self.devsetbox["wrap"] = "word" self.devsetbox.delete("1.0", "end") self.devsetlabel["text"] = "Development Set (%d/%d)" % ( (self.devset_index + 1, self._devset_size.get()) ) # Add the sentences sample = self.devset[self.devset_index : self.devset_index + 1] self.charnum = {} self.linenum = {0: 1} for sentnum, sent in enumerate(sample): linestr = "" for wordnum, (word, pos) in enumerate(sent.leaves()): self.charnum[sentnum, wordnum] = len(linestr) linestr += "%s/%s " % (word, pos) self.charnum[sentnum, wordnum + 1] = len(linestr) self.devsetbox.insert("end", linestr[:-1] + "\n\n") # Highlight chunks in the dev set if self.chunker is not None: self._highlight_devset() self.devsetbox["state"] = "disabled" # Update the scrollbar first = self.devset_index / self._devset_size.get() last = (self.devset_index + 2) / self._devset_size.get() self.devset_scroll.set(first, last) def _chunks(self, tree): chunks = set() wordnum = 0 for child in tree: if isinstance(child, Tree): if child.label() == self._chunk_label: chunks.add((wordnum, wordnum + len(child))) wordnum += len(child) else: wordnum += 1 return chunks def _syntax_highlight_grammar(self, grammar): if self.top is None: return self.grammarbox.tag_remove("comment", "1.0", "end") self.grammarbox.tag_remove("angle", "1.0", "end") self.grammarbox.tag_remove("brace", "1.0", "end") self.grammarbox.tag_add("hangindent", "1.0", "end") for lineno, line in enumerate(grammar.split("\n")): if not line.strip(): continue m = re.match(r"(\\.|[^#])*(#.*)?", line) comment_start = None if m.group(2): comment_start = m.start(2) s = "%d.%d" % (lineno + 1, m.start(2)) e = "%d.%d" % (lineno + 1, m.end(2)) self.grammarbox.tag_add("comment", s, e) for m in re.finditer("[<>{}]", line): if comment_start is not None and m.start() >= comment_start: break s = "%d.%d" % (lineno + 1, m.start()) e = "%d.%d" % (lineno + 1, m.end()) if m.group() in "<>": self.grammarbox.tag_add("angle", s, e) else: self.grammarbox.tag_add("brace", s, e) def _grammarcheck(self, grammar): if self.top is None: return self.grammarbox.tag_remove("error", "1.0", "end") self._grammarcheck_errs = [] for lineno, line in enumerate(grammar.split("\n")): line = re.sub(r"((\\.|[^#])*)(#.*)?", r"\1", line) line = line.strip() if line: try: RegexpChunkRule.fromstring(line) except ValueError as e: self.grammarbox.tag_add( "error", "%s.0" % (lineno + 1), "%s.0 lineend" % (lineno + 1) ) self.status["text"] = "" def update(self, *event): # Record when update was called (for grammarcheck) if event: self._last_keypress = time.time() # Read the grammar from the Text box. self.grammar = grammar = self.grammarbox.get("1.0", "end") # If the grammar hasn't changed, do nothing: normalized_grammar = self.normalize_grammar(grammar) if normalized_grammar == self.normalized_grammar: return else: self.normalized_grammar = normalized_grammar # If the grammar has changed, and we're looking at history, # then stop looking at history. if self._history_index < len(self._history) - 1: self.grammarlabel["text"] = "Grammar:" self._syntax_highlight_grammar(grammar) # The grammar has changed; try parsing it. If it doesn't # parse, do nothing. (flag error location?) try: # Note: the normalized grammar has no blank lines. if normalized_grammar: rules = [ RegexpChunkRule.fromstring(line) for line in normalized_grammar.split("\n") ] else: rules = [] except ValueError as e: # Use the un-normalized grammar for error highlighting. self._grammarcheck(grammar) self.chunker = None return self.chunker = RegexpChunkParser(rules) self.grammarbox.tag_remove("error", "1.0", "end") self.grammar_changed = time.time() # Display the results if self._showing_trace: self.show_trace() else: self._highlight_devset() # Start the eval demon if not self._eval_demon_running: self._eval_demon() def _highlight_devset(self, sample=None): if sample is None: sample = self.devset[self.devset_index : self.devset_index + 1] self.devsetbox.tag_remove("true-pos", "1.0", "end") self.devsetbox.tag_remove("false-neg", "1.0", "end") self.devsetbox.tag_remove("false-pos", "1.0", "end") # Run the grammar on the test cases. for sentnum, gold_tree in enumerate(sample): # Run the chunk parser test_tree = self._chunkparse(gold_tree.leaves()) # Extract gold & test chunks gold_chunks = self._chunks(gold_tree) test_chunks = self._chunks(test_tree) # Compare them. for chunk in gold_chunks.intersection(test_chunks): self._color_chunk(sentnum, chunk, "true-pos") for chunk in gold_chunks - test_chunks: self._color_chunk(sentnum, chunk, "false-neg") for chunk in test_chunks - gold_chunks: self._color_chunk(sentnum, chunk, "false-pos") def _chunkparse(self, words): try: return self.chunker.parse(words) except (ValueError, IndexError) as e: # There's an error somewhere in the grammar, but we're not sure # exactly where, so just mark the whole grammar as bad. # E.g., this is caused by: "({})" self.grammarbox.tag_add("error", "1.0", "end") # Treat it as tagging nothing: return words def _color_chunk(self, sentnum, chunk, tag): start, end = chunk self.devsetbox.tag_add( tag, "%s.%s" % (self.linenum[sentnum], self.charnum[sentnum, start]), "%s.%s" % (self.linenum[sentnum], self.charnum[sentnum, end] - 1), ) def reset(self): # Clear various variables self.chunker = None self.grammar = None self.normalized_grammar = None self.grammar_changed = 0 self._history = [] self._history_index = 0 # Update the on-screen display. self.grammarbox.delete("1.0", "end") self.show_devset(0) self.update() # self._eval_plot() SAVE_GRAMMAR_TEMPLATE = ( "# Regexp Chunk Parsing Grammar\n" "# Saved %(date)s\n" "#\n" "# Development set: %(devset)s\n" "# Precision: %(precision)s\n" "# Recall: %(recall)s\n" "# F-score: %(fscore)s\n\n" "%(grammar)s\n" ) def save_grammar(self, filename=None): if not filename: ftypes = [("Chunk Gramamr", ".chunk"), ("All files", "*")] filename = asksaveasfilename(filetypes=ftypes, defaultextension=".chunk") if not filename: return if self._history and self.normalized_grammar == self.normalize_grammar( self._history[-1][0] ): precision, recall, fscore = [ "%.2f%%" % (100 * v) for v in self._history[-1][1:] ] elif self.chunker is None: precision = recall = fscore = "Grammar not well formed" else: precision = recall = fscore = "Not finished evaluation yet" with open(filename, "w") as outfile: outfile.write( self.SAVE_GRAMMAR_TEMPLATE % dict( date=time.ctime(), devset=self.devset_name, precision=precision, recall=recall, fscore=fscore, grammar=self.grammar.strip(), ) ) def load_grammar(self, filename=None): if not filename: ftypes = [("Chunk Gramamr", ".chunk"), ("All files", "*")] filename = askopenfilename(filetypes=ftypes, defaultextension=".chunk") if not filename: return self.grammarbox.delete("1.0", "end") self.update() with open(filename, "r") as infile: grammar = infile.read() grammar = re.sub( "^\# Regexp Chunk Parsing Grammar[\s\S]*" "F-score:.*\n", "", grammar ).lstrip() self.grammarbox.insert("1.0", grammar) self.update() def save_history(self, filename=None): if not filename: ftypes = [("Chunk Gramamr History", ".txt"), ("All files", "*")] filename = asksaveasfilename(filetypes=ftypes, defaultextension=".txt") if not filename: return with open(filename, "w") as outfile: outfile.write("# Regexp Chunk Parsing Grammar History\n") outfile.write("# Saved %s\n" % time.ctime()) outfile.write("# Development set: %s\n" % self.devset_name) for i, (g, p, r, f) in enumerate(self._history): hdr = ( "Grammar %d/%d (precision=%.2f%%, recall=%.2f%%, " "fscore=%.2f%%)" % (i + 1, len(self._history), p * 100, r * 100, f * 100) ) outfile.write("\n%s\n" % hdr) outfile.write("".join(" %s\n" % line for line in g.strip().split())) if not ( self._history and self.normalized_grammar == self.normalize_grammar(self._history[-1][0]) ): if self.chunker is None: outfile.write("\nCurrent Grammar (not well-formed)\n") else: outfile.write("\nCurrent Grammar (not evaluated)\n") outfile.write( "".join(" %s\n" % line for line in self.grammar.strip().split()) ) def about(self, *e): ABOUT = "NLTK RegExp Chunk Parser Application\n" + "Written by Edward Loper" TITLE = "About: Regular Expression Chunk Parser Application" try: from tkinter.messagebox import Message Message(message=ABOUT, title=TITLE).show() except: ShowText(self.top, TITLE, ABOUT) def set_devset_size(self, size=None): if size is not None: self._devset_size.set(size) self._devset_size.set(min(len(self.devset), self._devset_size.get())) self.show_devset(1) self.show_devset(0) # what about history? Evaluated at diff dev set sizes! def resize(self, size=None): if size is not None: self._size.set(size) size = self._size.get() self._font.configure(size=-(abs(size))) self._smallfont.configure(size=min(-10, -(abs(size)) * 14 // 20)) def mainloop(self, *args, **kwargs): """ Enter the Tkinter mainloop. This function must be called if this demo is created from a non-interactive program (e.g. from a secript); otherwise, the demo will close as soon as the script completes. """ if in_idle(): return self.top.mainloop(*args, **kwargs) def app(): RegexpChunkApp().mainloop() if __name__ == "__main__": app() __all__ = ["app"]