You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

66 lines
1.7 KiB
Python

5 years ago
# Natural Language Toolkit: Dispersion Plots
#
# Copyright (C) 2001-2020 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
"""
A utility for displaying lexical dispersion.
"""
def dispersion_plot(text, words, ignore_case=False, title="Lexical Dispersion Plot"):
    """
    Generate a lexical dispersion plot.

    Each occurrence of a target word in ``text`` is drawn as a vertical
    tick at the token's offset, on the row belonging to that word.  The
    words are laid out in reverse order, so the first word in ``words``
    receives the highest y position.

    :param text: The source text
    :type text: list(str) or enum(str)
    :param words: The target words (the sequence is not modified)
    :type words: list of str
    :param ignore_case: flag to set if case should be ignored when searching text
    :type ignore_case: bool
    :param title: The title drawn above the plot
    :type title: str
    :raise ValueError: if matplotlib cannot be imported
    """
    try:
        from matplotlib import pylab
    except ImportError as e:
        raise ValueError(
            "The plot function requires matplotlib to be installed. "
            "See http://matplotlib.org/"
        ) from e

    # Reverse a copy rather than the argument itself, so the caller's list
    # keeps its original order after the call.
    rows = list(words)[::-1]

    # Map each (optionally lower-cased) target word to the row indices it
    # occupies.  A list is kept per word so that a word given more than once
    # is still plotted on every one of its rows.
    rows_by_word = {}
    for row, word in enumerate(rows):
        key = word.lower() if ignore_case else word
        rows_by_word.setdefault(key, []).append(row)

    # Single pass over the text: one dictionary lookup per token instead of
    # comparing every token against every target word.
    xs, ys = [], []
    for offset, token in enumerate(text):
        key = token.lower() if ignore_case else token
        for row in rows_by_word.get(key, ()):
            xs.append(offset)
            ys.append(row)

    # NOTE(review): scalex is passed through unchanged from the earlier
    # implementation; matplotlib appears to treat it as a boolean flag, so
    # 0.1 would simply be truthy -- confirm before changing.
    pylab.plot(xs, ys, "b|", scalex=0.1)
    pylab.yticks(list(range(len(rows))), rows, color="b")
    # Leave one empty row of padding below the first and above the last word.
    pylab.ylim(-1, len(rows))
    pylab.title(title)
    pylab.xlabel("Word Offset")
    pylab.show()
if __name__ == "__main__":
    # Demo run: plot four names against the tokens of the Gutenberg corpus
    # file "austen-sense.txt".
    from nltk.corpus import gutenberg

    words = ["Elinor", "Marianne", "Edward", "Willoughby"]
    dispersion_plot(gutenberg.words("austen-sense.txt"), words)