semcor.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296
  1. # Natural Language Toolkit: SemCor Corpus Reader
  2. #
  3. # Copyright (C) 2001-2020 NLTK Project
  4. # Author: Nathan Schneider <nschneid@cs.cmu.edu>
  5. # URL: <http://nltk.org/>
  6. # For license information, see LICENSE.TXT
  7. """
  8. Corpus reader for the SemCor Corpus.
  9. """
  10. __docformat__ = "epytext en"
  11. from nltk.corpus.reader.api import *
  12. from nltk.corpus.reader.xmldocs import XMLCorpusReader, XMLCorpusView
  13. from nltk.tree import Tree
  14. class SemcorCorpusReader(XMLCorpusReader):
  15. """
  16. Corpus reader for the SemCor Corpus.
  17. For access to the complete XML data structure, use the ``xml()``
  18. method. For access to simple word lists and tagged word lists, use
  19. ``words()``, ``sents()``, ``tagged_words()``, and ``tagged_sents()``.
  20. """
  21. def __init__(self, root, fileids, wordnet, lazy=True):
  22. XMLCorpusReader.__init__(self, root, fileids)
  23. self._lazy = lazy
  24. self._wordnet = wordnet
  25. def words(self, fileids=None):
  26. """
  27. :return: the given file(s) as a list of words and punctuation symbols.
  28. :rtype: list(str)
  29. """
  30. return self._items(fileids, "word", False, False, False)
  31. def chunks(self, fileids=None):
  32. """
  33. :return: the given file(s) as a list of chunks,
  34. each of which is a list of words and punctuation symbols
  35. that form a unit.
  36. :rtype: list(list(str))
  37. """
  38. return self._items(fileids, "chunk", False, False, False)
  39. def tagged_chunks(self, fileids=None, tag=("pos" or "sem" or "both")):
  40. """
  41. :return: the given file(s) as a list of tagged chunks, represented
  42. in tree form.
  43. :rtype: list(Tree)
  44. :param tag: `'pos'` (part of speech), `'sem'` (semantic), or `'both'`
  45. to indicate the kind of tags to include. Semantic tags consist of
  46. WordNet lemma IDs, plus an `'NE'` node if the chunk is a named entity
  47. without a specific entry in WordNet. (Named entities of type 'other'
  48. have no lemma. Other chunks not in WordNet have no semantic tag.
  49. Punctuation tokens have `None` for their part of speech tag.)
  50. """
  51. return self._items(fileids, "chunk", False, tag != "sem", tag != "pos")
  52. def sents(self, fileids=None):
  53. """
  54. :return: the given file(s) as a list of sentences, each encoded
  55. as a list of word strings.
  56. :rtype: list(list(str))
  57. """
  58. return self._items(fileids, "word", True, False, False)
  59. def chunk_sents(self, fileids=None):
  60. """
  61. :return: the given file(s) as a list of sentences, each encoded
  62. as a list of chunks.
  63. :rtype: list(list(list(str)))
  64. """
  65. return self._items(fileids, "chunk", True, False, False)
  66. def tagged_sents(self, fileids=None, tag=("pos" or "sem" or "both")):
  67. """
  68. :return: the given file(s) as a list of sentences. Each sentence
  69. is represented as a list of tagged chunks (in tree form).
  70. :rtype: list(list(Tree))
  71. :param tag: `'pos'` (part of speech), `'sem'` (semantic), or `'both'`
  72. to indicate the kind of tags to include. Semantic tags consist of
  73. WordNet lemma IDs, plus an `'NE'` node if the chunk is a named entity
  74. without a specific entry in WordNet. (Named entities of type 'other'
  75. have no lemma. Other chunks not in WordNet have no semantic tag.
  76. Punctuation tokens have `None` for their part of speech tag.)
  77. """
  78. return self._items(fileids, "chunk", True, tag != "sem", tag != "pos")
  79. def _items(self, fileids, unit, bracket_sent, pos_tag, sem_tag):
  80. if unit == "word" and not bracket_sent:
  81. # the result of the SemcorWordView may be a multiword unit, so the
  82. # LazyConcatenation will make sure the sentence is flattened
  83. _ = lambda *args: LazyConcatenation(
  84. (SemcorWordView if self._lazy else self._words)(*args)
  85. )
  86. else:
  87. _ = SemcorWordView if self._lazy else self._words
  88. return concat(
  89. [
  90. _(fileid, unit, bracket_sent, pos_tag, sem_tag, self._wordnet)
  91. for fileid in self.abspaths(fileids)
  92. ]
  93. )
  94. def _words(self, fileid, unit, bracket_sent, pos_tag, sem_tag):
  95. """
  96. Helper used to implement the view methods -- returns a list of
  97. tokens, (segmented) words, chunks, or sentences. The tokens
  98. and chunks may optionally be tagged (with POS and sense
  99. information).
  100. :param fileid: The name of the underlying file.
  101. :param unit: One of `'token'`, `'word'`, or `'chunk'`.
  102. :param bracket_sent: If true, include sentence bracketing.
  103. :param pos_tag: Whether to include part-of-speech tags.
  104. :param sem_tag: Whether to include semantic tags, namely WordNet lemma
  105. and OOV named entity status.
  106. """
  107. assert unit in ("token", "word", "chunk")
  108. result = []
  109. xmldoc = ElementTree.parse(fileid).getroot()
  110. for xmlsent in xmldoc.findall(".//s"):
  111. sent = []
  112. for xmlword in _all_xmlwords_in(xmlsent):
  113. itm = SemcorCorpusReader._word(
  114. xmlword, unit, pos_tag, sem_tag, self._wordnet
  115. )
  116. if unit == "word":
  117. sent.extend(itm)
  118. else:
  119. sent.append(itm)
  120. if bracket_sent:
  121. result.append(SemcorSentence(xmlsent.attrib["snum"], sent))
  122. else:
  123. result.extend(sent)
  124. assert None not in result
  125. return result
  126. @staticmethod
  127. def _word(xmlword, unit, pos_tag, sem_tag, wordnet):
  128. tkn = xmlword.text
  129. if not tkn:
  130. tkn = "" # fixes issue 337?
  131. lemma = xmlword.get("lemma", tkn) # lemma or NE class
  132. lexsn = xmlword.get("lexsn") # lex_sense (locator for the lemma's sense)
  133. if lexsn is not None:
  134. sense_key = lemma + "%" + lexsn
  135. wnpos = ("n", "v", "a", "r", "s")[
  136. int(lexsn.split(":")[0]) - 1
  137. ] # see http://wordnet.princeton.edu/man/senseidx.5WN.html
  138. else:
  139. sense_key = wnpos = None
  140. redef = xmlword.get(
  141. "rdf", tkn
  142. ) # redefinition--this indicates the lookup string
  143. # does not exactly match the enclosed string, e.g. due to typographical adjustments
  144. # or discontinuity of a multiword expression. If a redefinition has occurred,
  145. # the "rdf" attribute holds its inflected form and "lemma" holds its lemma.
  146. # For NEs, "rdf", "lemma", and "pn" all hold the same value (the NE class).
  147. sensenum = xmlword.get("wnsn") # WordNet sense number
  148. isOOVEntity = "pn" in xmlword.keys() # a "personal name" (NE) not in WordNet
  149. pos = xmlword.get(
  150. "pos"
  151. ) # part of speech for the whole chunk (None for punctuation)
  152. if unit == "token":
  153. if not pos_tag and not sem_tag:
  154. itm = tkn
  155. else:
  156. itm = (
  157. (tkn,)
  158. + ((pos,) if pos_tag else ())
  159. + ((lemma, wnpos, sensenum, isOOVEntity) if sem_tag else ())
  160. )
  161. return itm
  162. else:
  163. ww = tkn.split("_") # TODO: case where punctuation intervenes in MWE
  164. if unit == "word":
  165. return ww
  166. else:
  167. if sensenum is not None:
  168. try:
  169. sense = wordnet.lemma_from_key(sense_key) # Lemma object
  170. except Exception:
  171. # cannot retrieve the wordnet.Lemma object. possible reasons:
  172. # (a) the wordnet corpus is not downloaded;
  173. # (b) a nonexistant sense is annotated: e.g., such.s.00 triggers:
  174. # nltk.corpus.reader.wordnet.WordNetError: No synset found for key u'such%5:00:01:specified:00'
  175. # solution: just use the lemma name as a string
  176. try:
  177. sense = "%s.%s.%02d" % (
  178. lemma,
  179. wnpos,
  180. int(sensenum),
  181. ) # e.g.: reach.v.02
  182. except ValueError:
  183. sense = (
  184. lemma + "." + wnpos + "." + sensenum
  185. ) # e.g. the sense number may be "2;1"
  186. bottom = [Tree(pos, ww)] if pos_tag else ww
  187. if sem_tag and isOOVEntity:
  188. if sensenum is not None:
  189. return Tree(sense, [Tree("NE", bottom)])
  190. else: # 'other' NE
  191. return Tree("NE", bottom)
  192. elif sem_tag and sensenum is not None:
  193. return Tree(sense, bottom)
  194. elif pos_tag:
  195. return bottom[0]
  196. else:
  197. return bottom # chunk as a list
  198. def _all_xmlwords_in(elt, result=None):
  199. if result is None:
  200. result = []
  201. for child in elt:
  202. if child.tag in ("wf", "punc"):
  203. result.append(child)
  204. else:
  205. _all_xmlwords_in(child, result)
  206. return result
  207. class SemcorSentence(list):
  208. """
  209. A list of words, augmented by an attribute ``num`` used to record
  210. the sentence identifier (the ``n`` attribute from the XML).
  211. """
  212. def __init__(self, num, items):
  213. self.num = num
  214. list.__init__(self, items)
  215. class SemcorWordView(XMLCorpusView):
  216. """
  217. A stream backed corpus view specialized for use with the BNC corpus.
  218. """
  219. def __init__(self, fileid, unit, bracket_sent, pos_tag, sem_tag, wordnet):
  220. """
  221. :param fileid: The name of the underlying file.
  222. :param unit: One of `'token'`, `'word'`, or `'chunk'`.
  223. :param bracket_sent: If true, include sentence bracketing.
  224. :param pos_tag: Whether to include part-of-speech tags.
  225. :param sem_tag: Whether to include semantic tags, namely WordNet lemma
  226. and OOV named entity status.
  227. """
  228. if bracket_sent:
  229. tagspec = ".*/s"
  230. else:
  231. tagspec = ".*/s/(punc|wf)"
  232. self._unit = unit
  233. self._sent = bracket_sent
  234. self._pos_tag = pos_tag
  235. self._sem_tag = sem_tag
  236. self._wordnet = wordnet
  237. XMLCorpusView.__init__(self, fileid, tagspec)
  238. def handle_elt(self, elt, context):
  239. if self._sent:
  240. return self.handle_sent(elt)
  241. else:
  242. return self.handle_word(elt)
  243. def handle_word(self, elt):
  244. return SemcorCorpusReader._word(
  245. elt, self._unit, self._pos_tag, self._sem_tag, self._wordnet
  246. )
  247. def handle_sent(self, elt):
  248. sent = []
  249. for child in elt:
  250. if child.tag in ("wf", "punc"):
  251. itm = self.handle_word(child)
  252. if self._unit == "word":
  253. sent.extend(itm)
  254. else:
  255. sent.append(itm)
  256. else:
  257. raise ValueError("Unexpected element %s" % child.tag)
  258. return SemcorSentence(elt.attrib["snum"], sent)