| 1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465 |
- # Natural Language Toolkit: Dispersion Plots
- #
- # Copyright (C) 2001-2020 NLTK Project
- # Author: Steven Bird <stevenbird1@gmail.com>
- # URL: <http://nltk.org/>
- # For license information, see LICENSE.TXT
- """
- A utility for displaying lexical dispersion.
- """
def dispersion_plot(text, words, ignore_case=False, title="Lexical Dispersion Plot"):
    """
    Generate a lexical dispersion plot.

    Each target word gets its own row on the y-axis, and a tick is drawn at
    every token offset (x-axis) at which that word occurs in ``text``.  Rows
    are numbered in reverse order of ``words``, so the first target word
    receives the highest y value.

    :param text: The source text
    :type text: list(str) or iter(str)
    :param words: The target words; the caller's sequence is left unmodified
    :type words: list of str
    :param ignore_case: flag to set if case should be ignored when searching text
    :type ignore_case: bool
    :param title: The title displayed on the plot
    :type title: str
    :raise ValueError: if matplotlib cannot be imported
    """
    try:
        from matplotlib import pylab
    except ImportError as err:
        raise ValueError(
            "The plot function requires matplotlib to be installed. "
            "See http://matplotlib.org/"
        ) from err

    # Reverse a *copy* of the target words: reversing in place would leave the
    # caller's list in reversed order after every call.
    rows = list(words)[::-1]

    if ignore_case:
        fold = str.lower
    else:
        def fold(token):
            return token

    # Map each (possibly lower-cased) target word to every row index it
    # occupies, so duplicate target words still produce one tick per row.
    rows_by_word = {}
    for y, word in enumerate(rows):
        rows_by_word.setdefault(fold(word), []).append(y)

    # One dictionary lookup per token instead of comparing each token against
    # every target word; points stay ordered by offset, then by row.
    points = [
        (x, y)
        for x, token in enumerate(text)
        for y in rows_by_word.get(fold(token), ())
    ]

    if points:
        x, y = list(zip(*points))
    else:
        # Nothing matched: plot an empty series rather than failing on zip().
        x = y = ()

    # NOTE(review): scalex=0.1 is kept from the original call; matplotlib
    # documents scalex as a bool, so this presumably acts as True -- confirm.
    pylab.plot(x, y, "b|", scalex=0.1)
    pylab.yticks(list(range(len(rows))), rows, color="b")
    pylab.ylim(-1, len(rows))
    pylab.title(title)
    pylab.xlabel("Word Offset")
    pylab.show()
if __name__ == "__main__":
    # Demo: plot where four character names occur in the Gutenberg corpus
    # file "austen-sense.txt".
    from nltk.corpus import gutenberg

    character_names = ["Elinor", "Marianne", "Edward", "Willoughby"]
    novel_tokens = gutenberg.words("austen-sense.txt")
    dispersion_plot(novel_tokens, character_names)
|