You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
67 lines
1.7 KiB
Python
67 lines
1.7 KiB
Python
5 years ago
|
# Natural Language Toolkit: Dispersion Plots
|
||
|
#
|
||
|
# Copyright (C) 2001-2019 NLTK Project
|
||
|
# Author: Steven Bird <stevenbird1@gmail.com>
|
||
|
# URL: <http://nltk.org/>
|
||
|
# For license information, see LICENSE.TXT
|
||
|
|
||
|
"""
|
||
|
A utility for displaying lexical dispersion.
|
||
|
"""
|
||
|
|
||
|
|
||
|
def dispersion_plot(text, words, ignore_case=False, title="Lexical Dispersion Plot"):
    """
    Generate a lexical dispersion plot.

    Every occurrence of a target word is drawn as a vertical tick at the
    token offset where it occurs in ``text``.  Each target word gets its
    own row; the first word in ``words`` is placed on the top row.  The
    figure is displayed with ``pylab.show()``.

    :param text: The source text
    :type text: list(str) or enum(str)
    :param words: The target words (the sequence is not modified)
    :type words: list of str
    :param ignore_case: flag to set if case should be ignored when searching text
    :type ignore_case: bool
    :param title: The title drawn above the plot
    :type title: str
    :raise ValueError: if matplotlib cannot be imported
    """
    try:
        from matplotlib import pylab
    except ImportError:
        raise ValueError(
            'The plot function requires matplotlib to be installed. '
            'See http://matplotlib.org/'
        )

    tokens = list(text)
    # Reverse a *copy*: reversing in place would flip the caller's list on
    # every call.  Reversal puts the first target word on the highest row.
    labels = list(words)[::-1]

    if ignore_case:
        targets = [word.lower() for word in labels]
        tokens = [token.lower() for token in tokens]
    else:
        targets = labels

    # Map each target word to the row(s) it labels so the text only has to
    # be scanned once.  A word listed more than once keeps all of its rows.
    rows_by_word = {}
    for row, word in enumerate(targets):
        rows_by_word.setdefault(word, []).append(row)

    points = [
        (offset, row)
        for offset, token in enumerate(tokens)
        for row in rows_by_word.get(token, ())
    ]
    if points:
        x, y = list(zip(*points))
    else:
        # zip(*[]) yields nothing to unpack, so plot empty series instead.
        x = y = ()

    # NOTE(review): matplotlib documents ``scalex`` as a bool; 0.1 is simply
    # truthy here.  Left as-is to keep the rendered plot unchanged.
    pylab.plot(x, y, "b|", scalex=0.1)
    # Tick labels keep the caller's original casing even when ignore_case
    # is set; only the comparison is done on lower-cased strings.
    pylab.yticks(list(range(len(labels))), labels, color="b")
    pylab.ylim(-1, len(labels))
    pylab.title(title)
    pylab.xlabel("Word Offset")
    pylab.show()
|
||
|
|
||
|
|
||
|
if __name__ == '__main__':
    # Demo: when run as a script, plot the dispersion of four target words
    # over the tokens of 'austen-sense.txt' from the Gutenberg corpus.
    import nltk.compat
    from nltk.corpus import gutenberg

    demo_text = gutenberg.words('austen-sense.txt')
    words = ['Elinor', 'Marianne', 'Edward', 'Willoughby']
    dispersion_plot(demo_text, words)
|