switchboard.py 4.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126
  1. # Natural Language Toolkit: Switchboard Corpus Reader
  2. #
  3. # Copyright (C) 2001-2020 NLTK Project
  4. # Author: Edward Loper <edloper@gmail.com>
  5. # URL: <http://nltk.org/>
  6. # For license information, see LICENSE.TXT
  7. import re
  8. from nltk.tag import str2tuple, map_tag
  9. from nltk.corpus.reader.util import *
  10. from nltk.corpus.reader.api import *
  class SwitchboardTurn(list):
      """
      A specialized list object used to encode switchboard utterances.
      The elements of the list are the words in the utterance; and two
      attributes, ``speaker`` and ``id``, are provided to retrieve the
      speaker identifier and utterance id.  Note that utterance ids
      are only unique within a given discourse.
      """

      def __init__(self, words, speaker, id):
          """
          :param words: iterable of the utterance's tokens; either plain
              strings or ``(word, tag)`` tuples.
          :param speaker: the speaker identifier, stored unchanged.
          :param id: the utterance id; coerced with ``int()``, so a
              numeric string is accepted.
          """
          super().__init__(words)
          self.speaker = speaker
          self.id = int(id)

      def __repr__(self):
          """
          Return ``<speaker.id: 'text'>``, where text is the tokens joined
          by single spaces (tagged tokens are rendered as ``word/tag``).
          """
          if not self:
              text = ""
          elif isinstance(self[0], tuple):
              # Only the first element is inspected to decide whether the
              # turn is tagged.
              text = " ".join("%s/%s" % w for w in self)
          else:
              text = " ".join(self)
          return "<%s.%s: %r>" % (self.speaker, self.id, text)
  class SwitchboardCorpusReader(CorpusReader):
      """
      Corpus reader built on a single corpus file named ``"tagged"``.

      Every non-blank line of that file must look like
      ``<speaker>.<id>: <text>`` (see ``_UTTERANCE_RE``); ``<text>`` is
      split on whitespace and each token is split into a ``(word, tag)``
      pair with ``str2tuple`` using ``/`` as the separator.  Each
      utterance is returned as a ``SwitchboardTurn``.
      """

      # Use the "tagged" file even for non-tagged data methods, since
      # it's tokenized.
      _FILES = ["tagged"]

      def __init__(self, root, tagset=None):
          """
          :param root: corpus root, forwarded to ``CorpusReader.__init__``.
          :param tagset: name of the tagset the corpus file is annotated
              with; used as the source tagset when a caller asks for a
              different one.
          """
          CorpusReader.__init__(self, root, self._FILES)
          self._tagset = tagset

      def words(self):
          """Return a corpus view over the untagged words of the corpus."""
          return StreamBackedCorpusView(self.abspath("tagged"), self._words_block_reader)

      def tagged_words(self, tagset=None):
          """
          Return a corpus view over ``(word, tag)`` pairs.

          :param tagset: if given and different from the reader's own
              tagset, every tag is passed through ``map_tag``.
          """

          def tagged_words_block_reader(stream):
              return self._tagged_words_block_reader(stream, tagset)

          return StreamBackedCorpusView(self.abspath("tagged"), tagged_words_block_reader)

      def turns(self):
          """Return a corpus view over untagged ``SwitchboardTurn`` objects."""
          return StreamBackedCorpusView(self.abspath("tagged"), self._turns_block_reader)

      def tagged_turns(self, tagset=None):
          """
          Return a corpus view over ``SwitchboardTurn`` objects whose
          elements are ``(word, tag)`` pairs.

          :param tagset: if given and different from the reader's own
              tagset, every tag is passed through ``map_tag``.
          """

          def tagged_turns_block_reader(stream):
              return self._tagged_turns_block_reader(stream, tagset)

          return StreamBackedCorpusView(self.abspath("tagged"), tagged_turns_block_reader)

      def discourses(self):
          """Return a corpus view over discourses (lists of untagged turns)."""
          return StreamBackedCorpusView(
              self.abspath("tagged"), self._discourses_block_reader
          )

      def tagged_discourses(self, tagset=None):
          """
          Return a corpus view over discourses (lists of tagged turns).

          :param tagset: if given and different from the reader's own
              tagset, every tag is passed through ``map_tag``.  The
              default is ``None`` like the sibling ``tagged_*`` methods;
              any falsy value (including the former default ``False``)
              disables mapping.
          """

          def tagged_discourses_block_reader(stream):
              return self._tagged_discourses_block_reader(stream, tagset)

          return StreamBackedCorpusView(
              self.abspath("tagged"), tagged_discourses_block_reader
          )

      def _discourses_block_reader(self, stream):
          """Read one blank-line block and return ``[list_of_untagged_turns]``."""
          # returns at most 1 discourse.  (The other methods depend on this.)
          return [
              [
                  self._parse_utterance(u, include_tag=False)
                  for b in read_blankline_block(stream)
                  for u in b.split("\n")
                  if u.strip()
              ]
          ]

      def _tagged_discourses_block_reader(self, stream, tagset=None):
          """Read one blank-line block and return ``[list_of_tagged_turns]``."""
          # returns at most 1 discourse.  (The other methods depend on this.)
          return [
              [
                  self._parse_utterance(u, include_tag=True, tagset=tagset)
                  for b in read_blankline_block(stream)
                  for u in b.split("\n")
                  if u.strip()
              ]
          ]

      def _turns_block_reader(self, stream):
          """Return the turns of the single discourse read from ``stream``."""
          return self._discourses_block_reader(stream)[0]

      def _tagged_turns_block_reader(self, stream, tagset=None):
          """Return the tagged turns of the single discourse read from ``stream``."""
          return self._tagged_discourses_block_reader(stream, tagset)[0]

      def _words_block_reader(self, stream):
          """Return a flat list of the words of the discourse read from ``stream``."""
          # Single-pass flatten; ``sum(turns, [])`` copies the accumulated
          # list once per turn and is quadratic in the number of words.
          turns = self._discourses_block_reader(stream)[0]
          return [word for turn in turns for word in turn]

      def _tagged_words_block_reader(self, stream, tagset=None):
          """Return a flat list of ``(word, tag)`` pairs read from ``stream``."""
          turns = self._tagged_discourses_block_reader(stream, tagset)[0]
          return [pair for turn in turns for pair in turn]

      # Raw string: ``\w``, ``\d`` etc. are regex escapes, not string escapes.
      # Groups: speaker, utterance id, remaining text.
      _UTTERANCE_RE = re.compile(r"(\w+)\.(\d+)\:\s*(.*)")
      _SEP = "/"

      def _parse_utterance(self, utterance, include_tag, tagset=None):
          """
          Parse one ``<speaker>.<id>: <text>`` line into a ``SwitchboardTurn``.

          :param utterance: a single line of the corpus file.
          :param include_tag: if false, tags are dropped and the turn holds
              bare words; otherwise it holds ``(word, tag)`` pairs.
          :param tagset: target tagset; only consulted when ``include_tag``
              is true, and only applied when it is truthy and differs from
              the reader's own tagset.
          :raise ValueError: if the line does not match ``_UTTERANCE_RE``.
          """
          m = self._UTTERANCE_RE.match(utterance)
          if m is None:
              raise ValueError("Bad utterance %r" % utterance)
          speaker, utt_id, text = m.groups()
          words = [str2tuple(s, self._SEP) for s in text.split()]
          if not include_tag:
              words = [w for (w, t) in words]
          elif tagset and tagset != self._tagset:
              words = [(w, map_tag(self._tagset, tagset, t)) for (w, t) in words]
          return SwitchboardTurn(words, speaker, utt_id)