  1. # Natural Language Toolkit: Senseval 2 Corpus Reader
  2. #
  3. # Copyright (C) 2001-2020 NLTK Project
  4. # Author: Trevor Cohn <tacohn@cs.mu.oz.au>
  5. # Steven Bird <stevenbird1@gmail.com> (modifications)
  6. # URL: <http://nltk.org/>
  7. # For license information, see LICENSE.TXT
  8. """
  9. Read from the Senseval 2 Corpus.
  10. SENSEVAL [http://www.senseval.org/]
  11. Evaluation exercises for Word Sense Disambiguation.
  12. Organized by ACL-SIGLEX [http://www.siglex.org/]
  13. Prepared by Ted Pedersen <tpederse@umn.edu>, University of Minnesota,
  14. http://www.d.umn.edu/~tpederse/data.html
  15. Distributed with permission.
  16. The NLTK version of the Senseval 2 files uses well-formed XML.
  17. Each instance of the ambiguous words "hard", "interest", "line", and "serve"
  18. is tagged with a sense identifier, and supplied with context.
  19. """
import bisect
import re
from xml.etree import ElementTree

from nltk.corpus.reader.api import *
from nltk.corpus.reader.util import *
from nltk.tokenize import *
  25. class SensevalInstance(object):
  26. def __init__(self, word, position, context, senses):
  27. self.word = word
  28. self.senses = tuple(senses)
  29. self.position = position
  30. self.context = context
  31. def __repr__(self):
  32. return "SensevalInstance(word=%r, position=%r, " "context=%r, senses=%r)" % (
  33. self.word,
  34. self.position,
  35. self.context,
  36. self.senses,
  37. )
  38. class SensevalCorpusReader(CorpusReader):
  39. def instances(self, fileids=None):
  40. return concat(
  41. [
  42. SensevalCorpusView(fileid, enc)
  43. for (fileid, enc) in self.abspaths(fileids, True)
  44. ]
  45. )
  46. def raw(self, fileids=None):
  47. """
  48. :return: the text contents of the given fileids, as a single string.
  49. """
  50. if fileids is None:
  51. fileids = self._fileids
  52. elif isinstance(fileids, str):
  53. fileids = [fileids]
  54. return concat([self.open(f).read() for f in fileids])
  55. def _entry(self, tree):
  56. elts = []
  57. for lexelt in tree.findall("lexelt"):
  58. for inst in lexelt.findall("instance"):
  59. sense = inst[0].attrib["senseid"]
  60. context = [(w.text, w.attrib["pos"]) for w in inst[1]]
  61. elts.append((sense, context))
  62. return elts
class SensevalCorpusView(StreamBackedCorpusView):
    """
    Stream-backed view over a Senseval 2 pseudo-XML file.  Each block
    returned by ``read_block`` is a one-element list containing a
    ``SensevalInstance`` (or ``[]`` at end of file).
    """

    def __init__(self, fileid, encoding):
        StreamBackedCorpusView.__init__(self, fileid, encoding=encoding)
        self._word_tokenizer = WhitespaceTokenizer()
        # Parallel lists recording, for each <lexelt> seen so far, the
        # stream position where it starts and its item name.  Index 0 is
        # a sentinel covering the file prefix before the first <lexelt>.
        self._lexelt_starts = [0]  # list of streampos
        self._lexelts = [None]  # list of lexelt names

    def read_block(self, stream):
        """
        Read one ``<instance>...</instance>`` element from *stream*,
        repair its pseudo-XML, and return it parsed as a one-element
        list of ``SensevalInstance``; return ``[]`` at end of file.
        """
        # Decide which lexical element we're in.
        # NOTE(review): ``bisect`` is not imported in this module
        # directly; presumably it is re-exported by one of the wildcard
        # imports at the top — confirm.
        lexelt_num = bisect.bisect_right(self._lexelt_starts, stream.tell()) - 1
        lexelt = self._lexelts[lexelt_num]

        instance_lines = []
        in_instance = False
        while True:
            line = stream.readline()
            if line == "":
                # End of file: we must not be in the middle of an instance.
                assert instance_lines == []
                return []

            # Start of a lexical element?
            if line.lstrip().startswith("<lexelt"):
                lexelt_num += 1
                m = re.search("item=(\"[^\"]+\"|'[^']+')", line)
                assert m is not None  # <lexelt> has no 'item=...'
                lexelt = m.group(1)[1:-1]
                if lexelt_num < len(self._lexelts):
                    # Re-reading a lexelt we already indexed on an
                    # earlier pass: names must agree.
                    assert lexelt == self._lexelts[lexelt_num]
                else:
                    self._lexelts.append(lexelt)
                    self._lexelt_starts.append(stream.tell())

            # Start of an instance?
            if line.lstrip().startswith("<instance"):
                assert instance_lines == []
                in_instance = True

            # Body of an instance?
            if in_instance:
                instance_lines.append(line)

            # End of an instance?
            if line.lstrip().startswith("</instance"):
                xml_block = "\n".join(instance_lines)
                xml_block = _fixXML(xml_block)
                inst = ElementTree.fromstring(xml_block)
                return [self._parse_instance(inst, lexelt)]

    def _parse_instance(self, instance, lexelt):
        """
        Convert a parsed ``<instance>`` element into a
        ``SensevalInstance`` for the given lexelt name.  ``<answer>``
        children supply sense ids; the ``<context>`` child supplies the
        token list, with the ``<head>`` child marking the position of
        the ambiguous word.
        """
        senses = []
        context = []
        position = None
        for child in instance:
            if child.tag == "answer":
                senses.append(child.attrib["senseid"])
            elif child.tag == "context":
                context += self._word_tokenizer.tokenize(child.text)
                for cword in child:
                    if cword.tag == "compound":
                        cword = cword[0]  # is this ok to do?
                    if cword.tag == "head":
                        # Some sanity checks: exactly one of CDATA text
                        # or a single child element must be present.
                        assert position is None, "head specified twice"
                        assert cword.text.strip() or len(cword) == 1
                        assert not (cword.text.strip() and len(cword) == 1)
                        # Record the position of the head:
                        position = len(context)
                        # Add on the head word itself:
                        if cword.text.strip():
                            context.append(cword.text.strip())
                        elif cword[0].tag == "wf":
                            context.append((cword[0].text, cword[0].attrib["pos"]))
                            if cword[0].tail:
                                context += self._word_tokenizer.tokenize(cword[0].tail)
                        else:
                            assert False, "expected CDATA or wf in <head>"
                    elif cword.tag == "wf":
                        context.append((cword.text, cword.attrib["pos"]))
                    elif cword.tag == "s":
                        pass  # Sentence boundary marker.
                    else:
                        print("ACK", cword.tag)
                        assert False, "expected CDATA or <wf> or <head>"
                    # Trailing text after any element is plain context.
                    if cword.tail:
                        context += self._word_tokenizer.tokenize(cword.tail)
            else:
                assert False, "unexpected tag %s" % child.tag
        return SensevalInstance(lexelt, position, context, senses)
  144. def _fixXML(text):
  145. """
  146. Fix the various issues with Senseval pseudo-XML.
  147. """
  148. # <~> or <^> => ~ or ^
  149. text = re.sub(r"<([~\^])>", r"\1", text)
  150. # fix lone &
  151. text = re.sub(r"(\s+)\&(\s+)", r"\1&amp;\2", text)
  152. # fix """
  153. text = re.sub(r'"""', "'\"'", text)
  154. # fix <s snum=dd> => <s snum="dd"/>
  155. text = re.sub(r'(<[^<]*snum=)([^">]+)>', r'\1"\2"/>', text)
  156. # fix foreign word tag
  157. text = re.sub(r"<\&frasl>\s*<p[^>]*>", "FRASL", text)
  158. # remove <&I .>
  159. text = re.sub(r"<\&I[^>]*>", "", text)
  160. # fix <{word}>
  161. text = re.sub(r"<{([^}]+)}>", r"\1", text)
  162. # remove <@>, <p>, </p>
  163. text = re.sub(r"<(@|/?p)>", r"", text)
  164. # remove <&M .> and <&T .> and <&Ms .>
  165. text = re.sub(r"<&\w+ \.>", r"", text)
  166. # remove <!DOCTYPE... > lines
  167. text = re.sub(r"<!DOCTYPE[^>]*>", r"", text)
  168. # remove <[hi]> and <[/p]> etc
  169. text = re.sub(r"<\[\/?[^>]+\]*>", r"", text)
  170. # take the thing out of the brackets: <&hellip;>
  171. text = re.sub(r"<(\&\w+;)>", r"\1", text)
  172. # and remove the & for those patterns that aren't regular XML
  173. text = re.sub(r"&(?!amp|gt|lt|apos|quot)", r"", text)
  174. # fix 'abc <p="foo"/>' style tags - now <wf pos="foo">abc</wf>
  175. text = re.sub(
  176. r'[ \t]*([^<>\s]+?)[ \t]*<p="([^"]*"?)"/>', r' <wf pos="\2">\1</wf>', text
  177. )
  178. text = re.sub(r'\s*"\s*<p=\'"\'/>', " <wf pos='\"'>\"</wf>", text)
  179. return text