You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

443 lines
14 KiB
Python

# Natural Language Toolkit: Collocations Application
# Much of the GUI code is imported from concordance.py; We intend to merge these tools together
# Copyright (C) 2001-2019 NLTK Project
# Author: Sumukh Ghodke <sghodke@csse.unimelb.edu.au>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
#
from __future__ import division
import threading
from six.moves import queue as q
from six.moves.tkinter_font import Font
from six.moves.tkinter import (
Button,
END,
Frame,
IntVar,
LEFT,
Label,
Menu,
OptionMenu,
SUNKEN,
Scrollbar,
StringVar,
Text,
Tk,
)
from nltk.corpus import (
cess_cat,
brown,
nps_chat,
treebank,
sinica_treebank,
alpino,
indian,
floresta,
mac_morpho,
machado,
cess_esp,
)
from nltk.util import in_idle
from nltk.probability import FreqDist
# Event names the model's loader thread puts on the shared queue; the view
# compares polled queue items against them.
CORPUS_LOADED_EVENT = '<<CL_EVENT>>'
ERROR_LOADING_CORPUS_EVENT = '<<ELC_EVENT>>'

# Delay passed to Tk's ``after`` between two polls of the event queue
# (``after`` takes milliseconds).
POLL_INTERVAL = 100

# Corpus selected when the application starts; must be a key of _CORPORA.
_DEFAULT = 'English: Brown Corpus (Humor)'

# Display name -> zero-argument loader returning the words of the corpus.
# The loaders are lambdas so that no corpus is read until it is selected.
_CORPORA = {
    'Catalan: CESS-CAT Corpus': lambda: cess_cat.words(),
    'English: Brown Corpus': lambda: brown.words(),
    'English: Brown Corpus (Press)': lambda: brown.words(
        categories=['news', 'editorial', 'reviews']
    ),
    'English: Brown Corpus (Religion)': lambda: brown.words(categories='religion'),
    'English: Brown Corpus (Learned)': lambda: brown.words(categories='learned'),
    'English: Brown Corpus (Science Fiction)': lambda: brown.words(
        categories='science_fiction'
    ),
    'English: Brown Corpus (Romance)': lambda: brown.words(categories='romance'),
    'English: Brown Corpus (Humor)': lambda: brown.words(categories='humor'),
    'English: NPS Chat Corpus': lambda: nps_chat.words(),
    'English: Wall Street Journal Corpus': lambda: treebank.words(),
    'Chinese: Sinica Corpus': lambda: sinica_treebank.words(),
    'Dutch: Alpino Corpus': lambda: alpino.words(),
    # NLTK corpus readers name this parameter ``fileids``; the former
    # ``files=`` keyword raised TypeError, so this corpus never loaded.
    'Hindi: Indian Languages Corpus': lambda: indian.words(fileids='hindi.pos'),
    'Portuguese: Floresta Corpus (Portugal)': lambda: floresta.words(),
    'Portuguese: MAC-MORPHO Corpus (Brazil)': lambda: mac_morpho.words(),
    'Portuguese: Machado Corpus (Brazil)': lambda: machado.words(),
    'Spanish: CESS-ESP Corpus': lambda: cess_esp.words(),
}
class CollocationsView:
    """Tk window that pages through the collocations of a selectable corpus.

    The view owns a ``CollocationsModel`` and shares a queue with it.  The
    model's loader thread puts ``CORPUS_LOADED_EVENT`` or
    ``ERROR_LOADING_CORPUS_EVENT`` on that queue; the view polls the queue
    from the Tk main loop (see ``_poll``) so that widgets are only touched
    from the GUI thread.
    """

    _BACKGROUND_COLOUR = '#FFF'  # white

    def __init__(self):
        """Build the window, start loading the default corpus and begin polling."""
        self.queue = q.Queue()
        self.model = CollocationsModel(self.queue)
        self.top = Tk()
        self._init_top(self.top)
        # The menubar must be built before the corpus is loaded: building it
        # invokes a "Result Count" entry, which sets model.result_count.
        self._init_menubar()
        self._init_widgets(self.top)
        self.load_corpus(self.model.DEFAULT_CORPUS)
        # Handle of the pending poll, kept so destroy() can cancel it.
        self.after = self.top.after(POLL_INTERVAL, self._poll)

    def _init_top(self, top):
        """Configure geometry, title and close/quit bindings of the top-level window."""
        top.geometry('550x650+50+50')
        top.title('NLTK Collocations List')
        top.bind('<Control-q>', self.destroy)
        top.protocol('WM_DELETE_WINDOW', self.destroy)
        top.minsize(550, 650)

    def _init_widgets(self, parent):
        """Create the main frame and every widget group inside it."""
        self.main_frame = Frame(
            parent, background=self._BACKGROUND_COLOUR, padx=1, pady=1, border=1
        )
        self._init_corpus_select(self.main_frame)
        self._init_results_box(self.main_frame)
        self._init_paging(self.main_frame)
        self._init_status(self.main_frame)
        self.main_frame.pack(fill='both', expand=True)

    def _init_corpus_select(self, parent):
        """Create the "Corpus:" label and the drop-down listing every corpus."""
        innerframe = Frame(parent, background=self._BACKGROUND_COLOUR)
        # Tk variable holding the name currently shown in the drop-down.
        self.var = StringVar(innerframe)
        self.var.set(self.model.DEFAULT_CORPUS)
        Label(
            innerframe,
            justify=LEFT,
            text=' Corpus: ',
            background=self._BACKGROUND_COLOUR,
            padx=2,
            pady=1,
            border=0,
        ).pack(side='left')
        # The default corpus is listed first, followed by the others sorted
        # by name (see CollocationsModel.non_default_corpora).
        om = OptionMenu(
            innerframe,
            self.var,
            self.model.DEFAULT_CORPUS,
            command=self.corpus_selected,
            *self.model.non_default_corpora()
        )
        om['borderwidth'] = 0
        om['highlightthickness'] = 1
        om.pack(side='left')
        innerframe.pack(side='top', fill='x', anchor='n')

    def _init_status(self, parent):
        """Create the status label used for loading / loaded / error messages."""
        self.status = Label(
            parent,
            justify=LEFT,
            relief=SUNKEN,
            background=self._BACKGROUND_COLOUR,
            border=0,
            padx=1,
            pady=0,
        )
        self.status.pack(side='top', anchor='sw')

    def _init_menubar(self):
        """Create the File and Edit menus, including the "Result Count" choices."""
        # Tk variable shared by the result-count radio buttons.
        self._result_size = IntVar(self.top)
        menubar = Menu(self.top)

        filemenu = Menu(menubar, tearoff=0, borderwidth=0)
        filemenu.add_command(
            label='Exit', underline=1, command=self.destroy, accelerator='Ctrl-q'
        )
        menubar.add_cascade(label='File', underline=0, menu=filemenu)

        editmenu = Menu(menubar, tearoff=0)
        rescntmenu = Menu(editmenu, tearoff=0)
        for size in (20, 50, 100):
            rescntmenu.add_radiobutton(
                label=str(size),
                variable=self._result_size,
                underline=0,
                value=size,
                command=self.set_result_size,
            )
        # Select the second entry (50) and run its command so that the model
        # starts out with a result count.
        rescntmenu.invoke(1)
        editmenu.add_cascade(label='Result Count', underline=0, menu=rescntmenu)
        menubar.add_cascade(label='Edit', underline=0, menu=editmenu)

        self.top.config(menu=menubar)

    def set_result_size(self, **kwargs):
        """Copy the page size chosen in the menu into the model."""
        self.model.result_count = self._result_size.get()

    def _init_results_box(self, parent):
        """Create the read-only text box that shows results, with both scrollbars."""
        innerframe = Frame(parent)
        i1 = Frame(innerframe)  # text box + vertical scrollbar
        i2 = Frame(innerframe)  # horizontal scrollbar + spacer
        vscrollbar = Scrollbar(i1, borderwidth=1)
        hscrollbar = Scrollbar(i2, borderwidth=1, orient='horiz')
        self.results_box = Text(
            i1,
            font=Font(family='courier', size='16'),
            state='disabled',
            borderwidth=1,
            yscrollcommand=vscrollbar.set,
            xscrollcommand=hscrollbar.set,
            wrap='none',
            width='40',
            height='20',
            exportselection=1,
        )
        self.results_box.pack(side='left', fill='both', expand=True)
        vscrollbar.pack(side='left', fill='y', anchor='e')
        vscrollbar.config(command=self.results_box.yview)
        hscrollbar.pack(side='left', fill='x', expand=True, anchor='w')
        hscrollbar.config(command=self.results_box.xview)
        # Spacer label: with the pack layout manager there is no other way to
        # keep the two scrollbars from overlapping in the corner.
        Label(i2, text='   ', background=self._BACKGROUND_COLOUR).pack(
            side='left', anchor='e'
        )
        i1.pack(side='top', fill='both', expand=True, anchor='n')
        i2.pack(side='bottom', fill='x', anchor='s')
        innerframe.pack(side='top', fill='both', expand=True)

    def _init_paging(self, parent):
        """Create the Previous / Next buttons (initially disabled) and reset the page."""
        innerframe = Frame(parent, background=self._BACKGROUND_COLOUR)
        self.prev = prev_button = Button(
            innerframe,
            text='Previous',
            command=self.previous,
            width='10',
            borderwidth=1,
            highlightthickness=1,
            state='disabled',
        )
        prev_button.pack(side='left', anchor='center')
        # ``self.next`` holds the button widget; the handler is ``__next__``.
        self.next = next_button = Button(
            innerframe,
            text='Next',
            command=self.__next__,
            width='10',
            borderwidth=1,
            highlightthickness=1,
            state='disabled',
        )
        next_button.pack(side='right', anchor='center')
        innerframe.pack(side='top', fill='y')
        self.reset_current_page()

    def reset_current_page(self):
        """Mark that no page is displayed (page index -1)."""
        self.current_page = -1

    def _poll(self):
        """Handle at most one queued model event, then schedule the next poll."""
        try:
            event = self.queue.get(block=False)
        except q.Empty:
            pass
        else:
            if event == CORPUS_LOADED_EVENT:
                self.handle_corpus_loaded(event)
            elif event == ERROR_LOADING_CORPUS_EVENT:
                self.handle_error_loading_corpus(event)
        self.after = self.top.after(POLL_INTERVAL, self._poll)

    def handle_error_loading_corpus(self, event):
        """Report a failed load, empty the results box and disable paging.

        The model is deliberately not asked for paging information here:
        after a failed load it has no collocations, and the paging buttons
        end up disabled in any case.
        """
        self.status['text'] = 'Error in loading ' + self.var.get()
        self.clear_results_box()
        self.reset_current_page()
        self.freeze_editable()

    def handle_corpus_loaded(self, event):
        """Report a successful load and display the first page of results."""
        self.status['text'] = self.var.get() + ' is loaded'
        self.clear_results_box()
        # Forget the page index of any previously displayed corpus *before*
        # fetching, and refresh the buttons only once the new index is known;
        # otherwise "Previous" could stay enabled on the first page.
        self.reset_current_page()
        collocations = self.model.next(self.current_page + 1)
        self.write_results(collocations)
        self.current_page += 1
        self.unfreeze_editable()

    def corpus_selected(self, *args):
        """OptionMenu callback: load the corpus now shown in the drop-down."""
        new_selection = self.var.get()
        self.load_corpus(new_selection)

    def previous(self):
        """Display the page before the current one."""
        self.freeze_editable()
        collocations = self.model.prev(self.current_page - 1)
        self.current_page = self.current_page - 1
        self.clear_results_box()
        self.write_results(collocations)
        self.unfreeze_editable()

    def __next__(self):
        """Display the page after the current one ("Next" button handler)."""
        self.freeze_editable()
        collocations = self.model.next(self.current_page + 1)
        self.clear_results_box()
        self.write_results(collocations)
        self.current_page += 1
        self.unfreeze_editable()

    def load_corpus(self, selection):
        """Ask the model to load ``selection`` unless it is already selected."""
        if self.model.selected_corpus != selection:
            self.status['text'] = 'Loading ' + selection + '...'
            self.freeze_editable()
            self.model.load_corpus(selection)

    def freeze_editable(self):
        """Disable both paging buttons."""
        self.prev['state'] = 'disabled'
        self.next['state'] = 'disabled'

    def clear_results_box(self):
        """Delete all text from the results box, leaving it read-only."""
        self.results_box['state'] = 'normal'
        self.results_box.delete("1.0", END)
        self.results_box['state'] = 'disabled'

    def fire_event(self, event):
        """Generate the virtual event ``event`` on the top-level window."""
        # Firing an event so that rendering of widgets happens in the
        # mainloop thread.
        self.top.event_generate(event, when='tail')

    def destroy(self, *e):
        """Cancel the pending poll and destroy the window; safe to call twice."""
        if self.top is None:
            return
        self.top.after_cancel(self.after)
        self.top.destroy()
        self.top = None

    def mainloop(self, *args, **kwargs):
        """Run the Tk main loop, unless ``in_idle()`` reports true."""
        if in_idle():
            return
        self.top.mainloop(*args, **kwargs)

    def unfreeze_editable(self):
        """Re-enable whichever paging buttons apply to the current page."""
        self.set_paging_button_states()

    def set_paging_button_states(self):
        """Enable or disable Previous / Next according to the current page."""
        if self.current_page == -1 or self.current_page == 0:
            self.prev['state'] = 'disabled'
        else:
            self.prev['state'] = 'normal'
        if self.model.is_last_page(self.current_page):
            self.next['state'] = 'disabled'
        else:
            self.next['state'] = 'normal'

    def write_results(self, results):
        """Write one "word1 word2" line per collocation pair into the results box.

        :param results: iterable of two-string sequences.
        """
        self.results_box['state'] = 'normal'
        for row, pair in enumerate(results, 1):
            self.results_box.insert(str(row) + '.0', pair[0] + " " + pair[1] + "\n")
        self.results_box['state'] = 'disabled'
class CollocationsModel:
    """Holds the ranked collocations of the selected corpus and serves them in pages.

    Corpora are loaded on a background thread (``LoadCorpus``), which reports
    completion or failure by putting ``CORPUS_LOADED_EVENT`` or
    ``ERROR_LOADING_CORPUS_EVENT`` on ``queue``.
    """

    def __init__(self, queue):
        """
        :param queue: queue on which load results are announced.
        """
        self.result_count = None  # page size; set by the view
        self.selected_corpus = None
        self.collocations = None  # ranked list of pairs, or None if not loaded
        self.CORPORA = _CORPORA
        self.DEFAULT_CORPUS = _DEFAULT
        self.queue = queue
        self.reset_results()

    def reset_results(self):
        """Forget every page handed out so far."""
        self.result_pages = []
        # Number of collocation slots consumed by the cached pages; grows by
        # ``result_count`` per page even when the final page is shorter.
        self.results_returned = 0

    def load_corpus(self, name):
        """Start loading the corpus ``name`` on a background thread.

        :param name: a key of ``self.CORPORA``.
        """
        self.selected_corpus = name
        self.collocations = None
        # Clear the page cache before the worker starts so that all state is
        # consistent by the time the thread runs.
        self.reset_results()
        runner_thread = self.LoadCorpus(name, self)
        runner_thread.start()

    def non_default_corpora(self):
        """Return the names of all corpora except the default one, sorted."""
        return sorted(
            name for name in self.CORPORA if name != self.DEFAULT_CORPUS
        )

    def is_last_page(self, number):
        """Return True if no collocations remain after page ``number``.

        :param number: zero-based page index; -1 means "no page shown yet".
        :return: True when nothing is loaded, or when the pages up to and
            including ``number`` cover every collocation.
        """
        if self.collocations is None:
            # Nothing loaded (still loading, or loading failed).
            return True
        last_cached = len(self.result_pages) - 1
        if number < last_cached:
            # A later page has already been handed out.
            return False
        consumed = self.results_returned
        pages_ahead = number - last_cached
        if pages_ahead:
            # Pages not fetched yet would each take ``result_count`` items.
            consumed += pages_ahead * self.result_count
        return consumed >= len(self.collocations)

    def next(self, page):
        """Return page ``page``, fetching and caching any pages up to it.

        :param page: zero-based page index.
        :return: list of collocation pairs (possibly shorter than
            ``result_count``, or empty, past the end of the data).
        """
        missing = page - (len(self.result_pages) - 1)
        for _ in range(missing):
            start = self.results_returned
            stop = start + self.result_count
            self.result_pages.append(self.collocations[start:stop])
            self.results_returned = stop
        return self.result_pages[page]

    def prev(self, page):
        """Return the cached page ``page``, or an empty list for page -1."""
        if page == -1:
            return []
        return self.result_pages[page]

    class LoadCorpus(threading.Thread):
        """Worker thread that loads one corpus and ranks its word pairs."""

        def __init__(self, name, model):
            """
            :param name: key of ``model.CORPORA`` to load.
            :param model: the ``CollocationsModel`` that receives the result.
            """
            threading.Thread.__init__(self)
            self.model, self.name = model, name

        def run(self):
            """Compute the collocations and announce success or failure.

            Words of length <= 2 are dropped; each adjacent pair is scored as
            ``count(pair) ** 3 / (count(w1) * count(w2))`` and the pairs are
            stored in descending score order in ``model.collocations``.
            """
            try:
                words = self.model.CORPORA[self.name]()
                from operator import itemgetter

                text = [w for w in words if len(w) > 2]
                fd = FreqDist(tuple(text[i : i + 2]) for i in range(len(text) - 1))
                vocab = FreqDist(text)
                scored = [
                    ((w1, w2), fd[(w1, w2)] ** 3 / (vocab[w1] * vocab[w2]))
                    for w1, w2 in fd
                ]
                scored.sort(key=itemgetter(1), reverse=True)
                self.model.collocations = list(map(itemgetter(0), scored))
                self.model.queue.put(CORPUS_LOADED_EVENT)
            except Exception as e:
                # Top-level boundary of the worker thread: report the problem
                # and let the view show an error instead of dying silently.
                print(e)
                self.model.queue.put(ERROR_LOADING_CORPUS_EVENT)
# def collocations():
# colloc_strings = [w1 + ' ' + w2 for w1, w2 in self._collocations[:num]]
def app():
    """Open the collocations window and run its main loop."""
    CollocationsView().mainloop()
# Launch the GUI when this file is executed as a script.
if __name__ == '__main__':
    app()

# Public API of this module.
__all__ = ['app']