bnc.py 9.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258
# Natural Language Toolkit: BNC (British National Corpus) Corpus Reader
#
# Copyright (C) 2001-2020 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
  7. """Corpus reader for the XML version of the British National Corpus."""
  8. from nltk.corpus.reader.util import concat
  9. from nltk.corpus.reader.xmldocs import XMLCorpusReader, XMLCorpusView, ElementTree
  10. class BNCCorpusReader(XMLCorpusReader):
  11. """Corpus reader for the XML version of the British National Corpus.
  12. For access to the complete XML data structure, use the ``xml()``
  13. method. For access to simple word lists and tagged word lists, use
  14. ``words()``, ``sents()``, ``tagged_words()``, and ``tagged_sents()``.
  15. You can obtain the full version of the BNC corpus at
  16. http://www.ota.ox.ac.uk/desc/2554
  17. If you extracted the archive to a directory called `BNC`, then you can
  18. instantiate the reader as::
  19. BNCCorpusReader(root='BNC/Texts/', fileids=r'[A-K]/\w*/\w*\.xml')
  20. """
  21. def __init__(self, root, fileids, lazy=True):
  22. XMLCorpusReader.__init__(self, root, fileids)
  23. self._lazy = lazy
  24. def words(self, fileids=None, strip_space=True, stem=False):
  25. """
  26. :return: the given file(s) as a list of words
  27. and punctuation symbols.
  28. :rtype: list(str)
  29. :param strip_space: If true, then strip trailing spaces from
  30. word tokens. Otherwise, leave the spaces on the tokens.
  31. :param stem: If true, then use word stems instead of word strings.
  32. """
  33. return self._views(fileids, False, None, strip_space, stem)
  34. def tagged_words(self, fileids=None, c5=False, strip_space=True, stem=False):
  35. """
  36. :return: the given file(s) as a list of tagged
  37. words and punctuation symbols, encoded as tuples
  38. ``(word,tag)``.
  39. :rtype: list(tuple(str,str))
  40. :param c5: If true, then the tags used will be the more detailed
  41. c5 tags. Otherwise, the simplified tags will be used.
  42. :param strip_space: If true, then strip trailing spaces from
  43. word tokens. Otherwise, leave the spaces on the tokens.
  44. :param stem: If true, then use word stems instead of word strings.
  45. """
  46. tag = "c5" if c5 else "pos"
  47. return self._views(fileids, False, tag, strip_space, stem)
  48. def sents(self, fileids=None, strip_space=True, stem=False):
  49. """
  50. :return: the given file(s) as a list of
  51. sentences or utterances, each encoded as a list of word
  52. strings.
  53. :rtype: list(list(str))
  54. :param strip_space: If true, then strip trailing spaces from
  55. word tokens. Otherwise, leave the spaces on the tokens.
  56. :param stem: If true, then use word stems instead of word strings.
  57. """
  58. return self._views(fileids, True, None, strip_space, stem)
  59. def tagged_sents(self, fileids=None, c5=False, strip_space=True, stem=False):
  60. """
  61. :return: the given file(s) as a list of
  62. sentences, each encoded as a list of ``(word,tag)`` tuples.
  63. :rtype: list(list(tuple(str,str)))
  64. :param c5: If true, then the tags used will be the more detailed
  65. c5 tags. Otherwise, the simplified tags will be used.
  66. :param strip_space: If true, then strip trailing spaces from
  67. word tokens. Otherwise, leave the spaces on the tokens.
  68. :param stem: If true, then use word stems instead of word strings.
  69. """
  70. tag = "c5" if c5 else "pos"
  71. return self._views(
  72. fileids, sent=True, tag=tag, strip_space=strip_space, stem=stem
  73. )
  74. def _views(self, fileids=None, sent=False, tag=False, strip_space=True, stem=False):
  75. """A helper function that instantiates BNCWordViews or the list of words/sentences."""
  76. f = BNCWordView if self._lazy else self._words
  77. return concat(
  78. [
  79. f(fileid, sent, tag, strip_space, stem)
  80. for fileid in self.abspaths(fileids)
  81. ]
  82. )
  83. def _words(self, fileid, bracket_sent, tag, strip_space, stem):
  84. """
  85. Helper used to implement the view methods -- returns a list of
  86. words or a list of sentences, optionally tagged.
  87. :param fileid: The name of the underlying file.
  88. :param bracket_sent: If true, include sentence bracketing.
  89. :param tag: The name of the tagset to use, or None for no tags.
  90. :param strip_space: If true, strip spaces from word tokens.
  91. :param stem: If true, then substitute stems for words.
  92. """
  93. result = []
  94. xmldoc = ElementTree.parse(fileid).getroot()
  95. for xmlsent in xmldoc.findall(".//s"):
  96. sent = []
  97. for xmlword in _all_xmlwords_in(xmlsent):
  98. word = xmlword.text
  99. if not word:
  100. word = "" # fixes issue 337?
  101. if strip_space or stem:
  102. word = word.strip()
  103. if stem:
  104. word = xmlword.get("hw", word)
  105. if tag == "c5":
  106. word = (word, xmlword.get("c5"))
  107. elif tag == "pos":
  108. word = (word, xmlword.get("pos", xmlword.get("c5")))
  109. sent.append(word)
  110. if bracket_sent:
  111. result.append(BNCSentence(xmlsent.attrib["n"], sent))
  112. else:
  113. result.extend(sent)
  114. assert None not in result
  115. return result
  116. def _all_xmlwords_in(elt, result=None):
  117. if result is None:
  118. result = []
  119. for child in elt:
  120. if child.tag in ("c", "w"):
  121. result.append(child)
  122. else:
  123. _all_xmlwords_in(child, result)
  124. return result
  125. class BNCSentence(list):
  126. """
  127. A list of words, augmented by an attribute ``num`` used to record
  128. the sentence identifier (the ``n`` attribute from the XML).
  129. """
  130. def __init__(self, num, items):
  131. self.num = num
  132. list.__init__(self, items)
  133. class BNCWordView(XMLCorpusView):
  134. """
  135. A stream backed corpus view specialized for use with the BNC corpus.
  136. """
  137. tags_to_ignore = set(
  138. ["pb", "gap", "vocal", "event", "unclear", "shift", "pause", "align"]
  139. )
  140. """These tags are ignored. For their description refer to the
  141. technical documentation, for example,
  142. http://www.natcorp.ox.ac.uk/docs/URG/ref-vocal.html
  143. """
  144. def __init__(self, fileid, sent, tag, strip_space, stem):
  145. """
  146. :param fileid: The name of the underlying file.
  147. :param sent: If true, include sentence bracketing.
  148. :param tag: The name of the tagset to use, or None for no tags.
  149. :param strip_space: If true, strip spaces from word tokens.
  150. :param stem: If true, then substitute stems for words.
  151. """
  152. if sent:
  153. tagspec = ".*/s"
  154. else:
  155. tagspec = ".*/s/(.*/)?(c|w)"
  156. self._sent = sent
  157. self._tag = tag
  158. self._strip_space = strip_space
  159. self._stem = stem
  160. self.title = None #: Title of the document.
  161. self.author = None #: Author of the document.
  162. self.editor = None #: Editor
  163. self.resps = None #: Statement of responsibility
  164. XMLCorpusView.__init__(self, fileid, tagspec)
  165. # Read in a tasty header.
  166. self._open()
  167. self.read_block(self._stream, ".*/teiHeader$", self.handle_header)
  168. self.close()
  169. # Reset tag context.
  170. self._tag_context = {0: ()}
  171. def handle_header(self, elt, context):
  172. # Set up some metadata!
  173. titles = elt.findall("titleStmt/title")
  174. if titles:
  175. self.title = "\n".join(title.text.strip() for title in titles)
  176. authors = elt.findall("titleStmt/author")
  177. if authors:
  178. self.author = "\n".join(author.text.strip() for author in authors)
  179. editors = elt.findall("titleStmt/editor")
  180. if editors:
  181. self.editor = "\n".join(editor.text.strip() for editor in editors)
  182. resps = elt.findall("titleStmt/respStmt")
  183. if resps:
  184. self.resps = "\n\n".join(
  185. "\n".join(resp_elt.text.strip() for resp_elt in resp) for resp in resps
  186. )
  187. def handle_elt(self, elt, context):
  188. if self._sent:
  189. return self.handle_sent(elt)
  190. else:
  191. return self.handle_word(elt)
  192. def handle_word(self, elt):
  193. word = elt.text
  194. if not word:
  195. word = "" # fixes issue 337?
  196. if self._strip_space or self._stem:
  197. word = word.strip()
  198. if self._stem:
  199. word = elt.get("hw", word)
  200. if self._tag == "c5":
  201. word = (word, elt.get("c5"))
  202. elif self._tag == "pos":
  203. word = (word, elt.get("pos", elt.get("c5")))
  204. return word
  205. def handle_sent(self, elt):
  206. sent = []
  207. for child in elt:
  208. if child.tag in ("mw", "hi", "corr", "trunc"):
  209. sent += [self.handle_word(w) for w in child]
  210. elif child.tag in ("w", "c"):
  211. sent.append(self.handle_word(child))
  212. elif child.tag not in self.tags_to_ignore:
  213. raise ValueError("Unexpected element %s" % child.tag)
  214. return BNCSentence(elt.attrib["n"], sent)