bracket_parse.py 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272
  1. # Natural Language Toolkit: Penn Treebank Reader
  2. #
  3. # Copyright (C) 2001-2020 NLTK Project
  4. # Author: Steven Bird <stevenbird1@gmail.com>
  5. # Edward Loper <edloper@gmail.com>
  6. # URL: <http://nltk.org/>
  7. # For license information, see LICENSE.TXT
  8. """
  9. Corpus reader for corpora that consist of parenthesis-delineated parse trees.
  10. """
  11. import sys
  12. from nltk.tree import Tree
  13. from nltk.tag import map_tag
  14. from nltk.corpus.reader.util import *
  15. from nltk.corpus.reader.api import *
  16. # we use [^\s()]+ instead of \S+? to avoid matching ()
  17. SORTTAGWRD = re.compile(r"\((\d+) ([^\s()]+) ([^\s()]+)\)")
  18. TAGWORD = re.compile(r"\(([^\s()]+) ([^\s()]+)\)")
  19. WORD = re.compile(r"\([^\s()]+ ([^\s()]+)\)")
  20. EMPTY_BRACKETS = re.compile(r"\s*\(\s*\(")
  21. class BracketParseCorpusReader(SyntaxCorpusReader):
  22. """
  23. Reader for corpora that consist of parenthesis-delineated parse trees,
  24. like those found in the "combined" section of the Penn Treebank,
  25. e.g. "(S (NP (DT the) (JJ little) (NN dog)) (VP (VBD barked)))".
  26. """
  27. def __init__(
  28. self,
  29. root,
  30. fileids,
  31. comment_char=None,
  32. detect_blocks="unindented_paren",
  33. encoding="utf8",
  34. tagset=None,
  35. ):
  36. """
  37. :param root: The root directory for this corpus.
  38. :param fileids: A list or regexp specifying the fileids in this corpus.
  39. :param comment_char: The character which can appear at the start of
  40. a line to indicate that the rest of the line is a comment.
  41. :param detect_blocks: The method that is used to find blocks
  42. in the corpus; can be 'unindented_paren' (every unindented
  43. parenthesis starts a new parse) or 'sexpr' (brackets are
  44. matched).
  45. :param tagset: The name of the tagset used by this corpus, to be used
  46. for normalizing or converting the POS tags returned by the
  47. tagged_...() methods.
  48. """
  49. # FIXME: Why is it inheritting from SyntaxCorpusReader but initializing
  50. # from CorpusReader?
  51. CorpusReader.__init__(self, root, fileids, encoding)
  52. self._comment_char = comment_char
  53. self._detect_blocks = detect_blocks
  54. self._tagset = tagset
  55. def _read_block(self, stream):
  56. if self._detect_blocks == "sexpr":
  57. return read_sexpr_block(stream, comment_char=self._comment_char)
  58. elif self._detect_blocks == "blankline":
  59. return read_blankline_block(stream)
  60. elif self._detect_blocks == "unindented_paren":
  61. # Tokens start with unindented left parens.
  62. toks = read_regexp_block(stream, start_re=r"^\(")
  63. # Strip any comments out of the tokens.
  64. if self._comment_char:
  65. toks = [
  66. re.sub("(?m)^%s.*" % re.escape(self._comment_char), "", tok)
  67. for tok in toks
  68. ]
  69. return toks
  70. else:
  71. assert 0, "bad block type"
  72. def _normalize(self, t):
  73. # Replace leaves of the form (!), (,), with (! !), (, ,)
  74. t = re.sub(r"\((.)\)", r"(\1 \1)", t)
  75. # Replace leaves of the form (tag word root) with (tag word)
  76. t = re.sub(r"\(([^\s()]+) ([^\s()]+) [^\s()]+\)", r"(\1 \2)", t)
  77. return t
  78. def _parse(self, t):
  79. try:
  80. tree = Tree.fromstring(self._normalize(t))
  81. # If there's an empty node at the top, strip it off
  82. if tree.label() == '' and len(tree) == 1:
  83. return tree[0]
  84. else:
  85. return tree
  86. except ValueError as e:
  87. sys.stderr.write("Bad tree detected; trying to recover...\n")
  88. # Try to recover, if we can:
  89. if e.args == ("mismatched parens",):
  90. for n in range(1, 5):
  91. try:
  92. v = Tree(self._normalize(t + ")" * n))
  93. sys.stderr.write(
  94. " Recovered by adding %d close " "paren(s)\n" % n
  95. )
  96. return v
  97. except ValueError:
  98. pass
  99. # Try something else:
  100. sys.stderr.write(" Recovered by returning a flat parse.\n")
  101. # sys.stderr.write(' '.join(t.split())+'\n')
  102. return Tree("S", self._tag(t))
  103. def _tag(self, t, tagset=None):
  104. tagged_sent = [(w, p) for (p, w) in TAGWORD.findall(self._normalize(t))]
  105. if tagset and tagset != self._tagset:
  106. tagged_sent = [
  107. (w, map_tag(self._tagset, tagset, p)) for (w, p) in tagged_sent
  108. ]
  109. return tagged_sent
  110. def _word(self, t):
  111. return WORD.findall(self._normalize(t))
  112. class CategorizedBracketParseCorpusReader(
  113. CategorizedCorpusReader, BracketParseCorpusReader
  114. ):
  115. """
  116. A reader for parsed corpora whose documents are
  117. divided into categories based on their file identifiers.
  118. @author: Nathan Schneider <nschneid@cs.cmu.edu>
  119. """
  120. def __init__(self, *args, **kwargs):
  121. """
  122. Initialize the corpus reader. Categorization arguments
  123. (C{cat_pattern}, C{cat_map}, and C{cat_file}) are passed to
  124. the L{CategorizedCorpusReader constructor
  125. <CategorizedCorpusReader.__init__>}. The remaining arguments
  126. are passed to the L{BracketParseCorpusReader constructor
  127. <BracketParseCorpusReader.__init__>}.
  128. """
  129. CategorizedCorpusReader.__init__(self, kwargs)
  130. BracketParseCorpusReader.__init__(self, *args, **kwargs)
  131. def _resolve(self, fileids, categories):
  132. if fileids is not None and categories is not None:
  133. raise ValueError("Specify fileids or categories, not both")
  134. if categories is not None:
  135. return self.fileids(categories)
  136. else:
  137. return fileids
  138. def raw(self, fileids=None, categories=None):
  139. return BracketParseCorpusReader.raw(self, self._resolve(fileids, categories))
  140. def words(self, fileids=None, categories=None):
  141. return BracketParseCorpusReader.words(self, self._resolve(fileids, categories))
  142. def sents(self, fileids=None, categories=None):
  143. return BracketParseCorpusReader.sents(self, self._resolve(fileids, categories))
  144. def paras(self, fileids=None, categories=None):
  145. return BracketParseCorpusReader.paras(self, self._resolve(fileids, categories))
  146. def tagged_words(self, fileids=None, categories=None, tagset=None):
  147. return BracketParseCorpusReader.tagged_words(
  148. self, self._resolve(fileids, categories), tagset
  149. )
  150. def tagged_sents(self, fileids=None, categories=None, tagset=None):
  151. return BracketParseCorpusReader.tagged_sents(
  152. self, self._resolve(fileids, categories), tagset
  153. )
  154. def tagged_paras(self, fileids=None, categories=None, tagset=None):
  155. return BracketParseCorpusReader.tagged_paras(
  156. self, self._resolve(fileids, categories), tagset
  157. )
  158. def parsed_words(self, fileids=None, categories=None):
  159. return BracketParseCorpusReader.parsed_words(
  160. self, self._resolve(fileids, categories)
  161. )
  162. def parsed_sents(self, fileids=None, categories=None):
  163. return BracketParseCorpusReader.parsed_sents(
  164. self, self._resolve(fileids, categories)
  165. )
  166. def parsed_paras(self, fileids=None, categories=None):
  167. return BracketParseCorpusReader.parsed_paras(
  168. self, self._resolve(fileids, categories)
  169. )
  170. class AlpinoCorpusReader(BracketParseCorpusReader):
  171. """
  172. Reader for the Alpino Dutch Treebank.
  173. This corpus has a lexical breakdown structure embedded, as read by _parse
  174. Unfortunately this puts punctuation and some other words out of the sentence
  175. order in the xml element tree. This is no good for tag_ and word_
  176. _tag and _word will be overridden to use a non-default new parameter 'ordered'
  177. to the overridden _normalize function. The _parse function can then remain
  178. untouched.
  179. """
  180. def __init__(self, root, encoding="ISO-8859-1", tagset=None):
  181. BracketParseCorpusReader.__init__(
  182. self,
  183. root,
  184. "alpino\.xml",
  185. detect_blocks="blankline",
  186. encoding=encoding,
  187. tagset=tagset,
  188. )
  189. def _normalize(self, t, ordered=False):
  190. """Normalize the xml sentence element in t.
  191. The sentence elements <alpino_ds>, although embedded in a few overall
  192. xml elements, are seperated by blank lines. That's how the reader can
  193. deliver them one at a time.
  194. Each sentence has a few category subnodes that are of no use to us.
  195. The remaining word nodes may or may not appear in the proper order.
  196. Each word node has attributes, among which:
  197. - begin : the position of the word in the sentence
  198. - pos : Part of Speech: the Tag
  199. - word : the actual word
  200. The return value is a string with all xml elementes replaced by
  201. clauses: either a cat clause with nested clauses, or a word clause.
  202. The order of the bracket clauses closely follows the xml.
  203. If ordered == True, the word clauses include an order sequence number.
  204. If ordered == False, the word clauses only have pos and word parts.
  205. """
  206. if t[:10] != "<alpino_ds":
  207. return ""
  208. # convert XML to sexpr notation
  209. t = re.sub(r' <node .*? cat="(\w+)".*>', r"(\1", t)
  210. if ordered:
  211. t = re.sub(
  212. r' <node. *?begin="(\d+)".*? pos="(\w+)".*? word="([^"]+)".*?/>',
  213. r"(\1 \2 \3)",
  214. t,
  215. )
  216. else:
  217. t = re.sub(r' <node .*?pos="(\w+)".*? word="([^"]+)".*?/>', r"(\1 \2)", t)
  218. t = re.sub(r" </node>", r")", t)
  219. t = re.sub(r"<sentence>.*</sentence>", r"", t)
  220. t = re.sub(r"</?alpino_ds.*>", r"", t)
  221. return t
  222. def _tag(self, t, tagset=None):
  223. tagged_sent = [
  224. (int(o), w, p)
  225. for (o, p, w) in SORTTAGWRD.findall(self._normalize(t, ordered=True))
  226. ]
  227. tagged_sent.sort()
  228. if tagset and tagset != self._tagset:
  229. tagged_sent = [
  230. (w, map_tag(self._tagset, tagset, p)) for (o, w, p) in tagged_sent
  231. ]
  232. else:
  233. tagged_sent = [(w, p) for (o, w, p) in tagged_sent]
  234. return tagged_sent
  235. def _word(self, t):
  236. """Return a correctly ordered list if words"""
  237. tagged_sent = self._tag(t)
  238. return [w for (w, p) in tagged_sent]