nps_chat.py 2.8 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091
  1. # Natural Language Toolkit: NPS Chat Corpus Reader
  2. #
  3. # Copyright (C) 2001-2020 NLTK Project
  4. # Author: Edward Loper <edloper@gmail.com>
  5. # URL: <http://nltk.org/>
  6. # For license information, see LICENSE.TXT
  7. import re
  8. import textwrap
  9. from nltk.util import LazyConcatenation
  10. from nltk.internals import ElementWrapper
  11. from nltk.tag import map_tag
  12. from nltk.corpus.reader.util import *
  13. from nltk.corpus.reader.api import *
  14. from nltk.corpus.reader.xmldocs import *
  15. class NPSChatCorpusReader(XMLCorpusReader):
  16. def __init__(self, root, fileids, wrap_etree=False, tagset=None):
  17. XMLCorpusReader.__init__(self, root, fileids, wrap_etree)
  18. self._tagset = tagset
  19. def xml_posts(self, fileids=None):
  20. if self._wrap_etree:
  21. return concat(
  22. [
  23. XMLCorpusView(fileid, "Session/Posts/Post", self._wrap_elt)
  24. for fileid in self.abspaths(fileids)
  25. ]
  26. )
  27. else:
  28. return concat(
  29. [
  30. XMLCorpusView(fileid, "Session/Posts/Post")
  31. for fileid in self.abspaths(fileids)
  32. ]
  33. )
  34. def posts(self, fileids=None):
  35. return concat(
  36. [
  37. XMLCorpusView(
  38. fileid, "Session/Posts/Post/terminals", self._elt_to_words
  39. )
  40. for fileid in self.abspaths(fileids)
  41. ]
  42. )
  43. def tagged_posts(self, fileids=None, tagset=None):
  44. def reader(elt, handler):
  45. return self._elt_to_tagged_words(elt, handler, tagset)
  46. return concat(
  47. [
  48. XMLCorpusView(fileid, "Session/Posts/Post/terminals", reader)
  49. for fileid in self.abspaths(fileids)
  50. ]
  51. )
  52. def words(self, fileids=None):
  53. return LazyConcatenation(self.posts(fileids))
  54. def tagged_words(self, fileids=None, tagset=None):
  55. return LazyConcatenation(self.tagged_posts(fileids, tagset))
  56. def _wrap_elt(self, elt, handler):
  57. return ElementWrapper(elt)
  58. def _elt_to_words(self, elt, handler):
  59. return [self._simplify_username(t.attrib["word"]) for t in elt.findall("t")]
  60. def _elt_to_tagged_words(self, elt, handler, tagset=None):
  61. tagged_post = [
  62. (self._simplify_username(t.attrib["word"]), t.attrib["pos"])
  63. for t in elt.findall("t")
  64. ]
  65. if tagset and tagset != self._tagset:
  66. tagged_post = [
  67. (w, map_tag(self._tagset, tagset, t)) for (w, t) in tagged_post
  68. ]
  69. return tagged_post
  70. @staticmethod
  71. def _simplify_username(word):
  72. if "User" in word:
  73. word = "U" + word.split("User", 1)[1]
  74. elif isinstance(word, bytes):
  75. word = word.decode("ascii")
  76. return word