You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
66 lines
1.7 KiB
Python
66 lines
1.7 KiB
Python
# Natural Language Toolkit: Dispersion Plots
|
|
#
|
|
# Copyright (C) 2001-2020 NLTK Project
|
|
# Author: Steven Bird <stevenbird1@gmail.com>
|
|
# URL: <http://nltk.org/>
|
|
# For license information, see LICENSE.TXT
|
|
|
|
"""
|
|
A utility for displaying lexical dispersion.
|
|
"""
|
|
|
|
|
|
def dispersion_plot(text, words, ignore_case=False, title="Lexical Dispersion Plot"):
    """
    Generate a lexical dispersion plot.

    Each occurrence of a target word in ``text`` is drawn as a vertical
    tick at the token's offset (x axis) on the row belonging to that
    word (y axis).  The plot is displayed with ``pylab.show()``.

    :param text: The source text
    :type text: list(str) or enum(str)
    :param words: The target words.  Rows are assigned in reverse order,
        so the first word gets the highest y position.  The sequence
        passed in is not modified.
    :type words: list of str
    :param ignore_case: flag to set if case should be ignored when searching text
    :type ignore_case: bool
    :param title: The title placed on the plot
    :type title: str
    :raise ValueError: if matplotlib cannot be imported
    """
    try:
        from matplotlib import pylab
    except ImportError as err:
        # ValueError is kept for backward compatibility with existing
        # callers; the original ImportError is chained as the cause.
        raise ValueError(
            "The plot function requires matplotlib to be installed. "
            "See http://matplotlib.org/"
        ) from err

    # Reverse into a private copy: calling ``words.reverse()`` would flip
    # the caller's list as a side effect of plotting.
    labels = list(reversed(words))

    # Map each (optionally lower-cased) target word to the row(s) it
    # occupies.  A list of rows keeps duplicate target words working: every
    # duplicate receives its own row, exactly as a pairwise comparison would.
    rows_by_word = {}
    for row, word in enumerate(labels):
        key = str.lower(word) if ignore_case else word
        rows_by_word.setdefault(key, []).append(row)

    # Single pass over the text; points come out ordered by offset, then row.
    points = []
    for offset, token in enumerate(text):
        key = str.lower(token) if ignore_case else token
        for row in rows_by_word.get(key, ()):
            points.append((offset, row))

    if points:
        x, y = list(zip(*points))
    else:
        # zip(*[]) yields nothing to unpack, so plot two empty series.
        x = y = ()

    # NOTE(review): scalex=0.1 is passed through unchanged from the
    # previous implementation -- confirm the intended value.
    pylab.plot(x, y, "b|", scalex=0.1)
    pylab.yticks(list(range(len(labels))), labels, color="b")
    # One unit of padding below the first row and above the last row.
    pylab.ylim(-1, len(labels))
    pylab.title(title)
    pylab.xlabel("Word Offset")
    pylab.show()
|
|
|
|
|
|
if __name__ == "__main__":
    # Demo: plot where four character names occur across the Gutenberg
    # corpus file "austen-sense.txt".
    from nltk.corpus import gutenberg

    words = [
        "Elinor",
        "Marianne",
        "Edward",
        "Willoughby",
    ]
    corpus_tokens = gutenberg.words("austen-sense.txt")
    dispersion_plot(corpus_tokens, words)
|