| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207 |
- # Natural Language Toolkit: Senseval 2 Corpus Reader
- #
- # Copyright (C) 2001-2020 NLTK Project
- # Author: Trevor Cohn <tacohn@cs.mu.oz.au>
- # Steven Bird <stevenbird1@gmail.com> (modifications)
- # URL: <http://nltk.org/>
- # For license information, see LICENSE.TXT
- """
- Read from the Senseval 2 Corpus.
- SENSEVAL [http://www.senseval.org/]
- Evaluation exercises for Word Sense Disambiguation.
- Organized by ACL-SIGLEX [http://www.siglex.org/]
- Prepared by Ted Pedersen <tpederse@umn.edu>, University of Minnesota,
- http://www.d.umn.edu/~tpederse/data.html
- Distributed with permission.
- The NLTK version of the Senseval 2 files uses well-formed XML.
- Each instance of the ambiguous words "hard", "interest", "line", and "serve"
- is tagged with a sense identifier, and supplied with context.
- """
- import re
- from xml.etree import ElementTree
- from nltk.tokenize import *
- from nltk.corpus.reader.util import *
- from nltk.corpus.reader.api import *
class SensevalInstance(object):
    """A single corpus instance: a word, its context and its sense labels.

    Attributes:
        word: the value passed as ``word`` (stored unchanged).
        position: the value passed as ``position`` (stored unchanged).
        context: the value passed as ``context`` (stored unchanged).
        senses: the given senses, frozen into a tuple.
    """

    def __init__(self, word, position, context, senses):
        self.word, self.position, self.context = word, position, context
        # Senses are copied into a tuple so later changes to the caller's
        # iterable do not affect this instance.
        self.senses = tuple(senses)

    def __repr__(self):
        template = "SensevalInstance(word=%r, position=%r, context=%r, senses=%r)"
        values = (self.word, self.position, self.context, self.senses)
        return template % values
class SensevalCorpusReader(CorpusReader):
    """Corpus reader exposing Senseval files as instances or as raw text."""

    def instances(self, fileids=None):
        """
        Build one ``SensevalCorpusView`` per file and concatenate them.

        :param fileids: forwarded unchanged to ``self.abspaths``.
        :return: the result of ``concat`` over the per-file views.
        """
        return concat(
            [
                SensevalCorpusView(fileid, enc)
                for (fileid, enc) in self.abspaths(fileids, True)
            ]
        )

    def raw(self, fileids=None):
        """
        :param fileids: ``None`` (use ``self._fileids``), a single file
            identifier string, or an iterable of file identifiers.
        :return: the text contents of the given fileids, as a single string.
        """
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, str):
            fileids = [fileids]
        contents = []
        for f in fileids:
            # Close every stream as soon as it has been read; the previous
            # ``self.open(f).read()`` left one open handle per file behind.
            with self.open(f) as stream:
                contents.append(stream.read())
        return concat(contents)

    def _entry(self, tree):
        """
        Collect ``(sense, context)`` pairs from a parsed element tree.

        For every ``instance`` element below every ``lexelt`` element of
        ``tree``, ``sense`` is the ``senseid`` attribute of the instance's
        first child and ``context`` is a list of ``(text, pos)`` pairs, one
        per child of the instance's second child.

        :param tree: an element supporting ``findall``.
        :return: a list of ``(sense, context)`` tuples.
        """
        elts = []
        for lexelt in tree.findall("lexelt"):
            for inst in lexelt.findall("instance"):
                sense = inst[0].attrib["senseid"]
                context = [(w.text, w.attrib["pos"]) for w in inst[1]]
                elts.append((sense, context))
        return elts
class SensevalCorpusView(StreamBackedCorpusView):
    """Stream-backed view that yields one ``SensevalInstance`` per block.

    Each call to ``read_block`` consumes lines up to and including the next
    ``</instance`` line, repairs the collected text with ``_fixXML``, parses
    it with ElementTree and converts it with ``_parse_instance``.
    """

    def __init__(self, fileid, encoding):
        StreamBackedCorpusView.__init__(self, fileid, encoding=encoding)

        self._word_tokenizer = WhitespaceTokenizer()
        # Parallel lists: _lexelt_starts[i] is the stream position recorded
        # right after reading the i-th "<lexelt" line, and _lexelts[i] is the
        # item name found on that line.  Entry 0 (position 0, name None)
        # covers anything read before the first "<lexelt" line.
        self._lexelt_starts = [0]  # list of streampos
        self._lexelts = [None]  # list of lexelt names

    def read_block(self, stream):
        """
        Read lines from ``stream`` until one instance has been collected.

        :param stream: an object providing ``readline()`` and ``tell()``.
        :return: a one-element list holding the parsed ``SensevalInstance``,
            or an empty list once the stream is exhausted.
        """
        # NOTE(review): ``bisect`` is not imported by name in the visible
        # import block; presumably it arrives through one of the star
        # imports -- confirm.
        # Decide which lexical element we're in.
        lexelt_num = bisect.bisect_right(self._lexelt_starts, stream.tell()) - 1
        lexelt = self._lexelts[lexelt_num]

        instance_lines = []
        in_instance = False
        while True:
            line = stream.readline()
            if line == "":
                # End of stream: there must be no half-read instance.
                assert instance_lines == []
                return []

            # Start of a lexical element?
            if line.lstrip().startswith("<lexelt"):
                lexelt_num += 1
                m = re.search("item=(\"[^\"]+\"|'[^']+')", line)
                assert m is not None  # <lexelt> has no 'item=...'
                lexelt = m.group(1)[1:-1]  # drop the surrounding quotes
                if lexelt_num < len(self._lexelts):
                    # Already recorded on an earlier pass: must agree.
                    assert lexelt == self._lexelts[lexelt_num]
                else:
                    self._lexelts.append(lexelt)
                    self._lexelt_starts.append(stream.tell())

            # Start of an instance?
            if line.lstrip().startswith("<instance"):
                assert instance_lines == []
                in_instance = True

            # Body of an instance?
            if in_instance:
                instance_lines.append(line)

            # End of an instance?
            if line.lstrip().startswith("</instance"):
                xml_block = "\n".join(instance_lines)
                xml_block = _fixXML(xml_block)
                inst = ElementTree.fromstring(xml_block)
                return [self._parse_instance(inst, lexelt)]

    def _parse_instance(self, instance, lexelt):
        """
        Convert a parsed ``<instance>`` element into a ``SensevalInstance``.

        :param instance: the element whose children are ``answer`` and
            ``context`` elements.
        :param lexelt: stored as the ``word`` of the resulting instance.
        :return: ``SensevalInstance(lexelt, position, context, senses)`` where
            ``senses`` holds every ``senseid``, ``context`` is a list of
            whitespace tokens and ``(text, pos)`` tuples for ``wf`` elements,
            and ``position`` is the index in ``context`` at which the
            ``head`` word was added (``None`` if there was no head).
        :raise AssertionError: on an unexpected tag or a malformed ``head``.
        """
        senses = []
        context = []
        position = None
        for child in instance:
            if child.tag == "answer":
                senses.append(child.attrib["senseid"])
            elif child.tag == "context":
                # ElementTree leaves ``text`` as None when the element starts
                # directly with a child element, so guard it like ``tail``.
                if child.text:
                    context += self._word_tokenizer.tokenize(child.text)
                for cword in child:
                    if cword.tag == "compound":
                        # NOTE(review): only the first child of a compound is
                        # processed, and the tail handled below is that
                        # child's tail, not the compound's -- confirm.
                        cword = cword[0]  # is this ok to do?

                    if cword.tag == "head":
                        head_text = (cword.text or "").strip()
                        # Some sanity checks:
                        assert position is None, "head specified twice"
                        # Exactly one of "plain text" / "single child element".
                        assert head_text or len(cword) == 1
                        assert not (head_text and len(cword) == 1)
                        # Record the position of the head:
                        position = len(context)
                        # Add on the head word itself:
                        if head_text:
                            context.append(head_text)
                        elif cword[0].tag == "wf":
                            context.append((cword[0].text, cword[0].attrib["pos"]))
                            if cword[0].tail:
                                context += self._word_tokenizer.tokenize(cword[0].tail)
                        else:
                            # Raised explicitly: ``assert False`` would be
                            # stripped when running with -O.
                            raise AssertionError("expected CDATA or wf in <head>")
                    elif cword.tag == "wf":
                        context.append((cword.text, cword.attrib["pos"]))
                    elif cword.tag == "s":
                        pass  # Sentence boundary marker.

                    else:
                        raise AssertionError(
                            "expected CDATA or <wf> or <head>, got <%s>" % cword.tag
                        )
                    if cword.tail:
                        context += self._word_tokenizer.tokenize(cword.tail)
            else:
                raise AssertionError("unexpected tag %s" % child.tag)
        return SensevalInstance(lexelt, position, context, senses)
- def _fixXML(text):
- """
- Fix the various issues with Senseval pseudo-XML.
- """
- # <~> or <^> => ~ or ^
- text = re.sub(r"<([~\^])>", r"\1", text)
- # fix lone &
- text = re.sub(r"(\s+)\&(\s+)", r"\1&\2", text)
- # fix """
- text = re.sub(r'"""', "'\"'", text)
- # fix <s snum=dd> => <s snum="dd"/>
- text = re.sub(r'(<[^<]*snum=)([^">]+)>', r'\1"\2"/>', text)
- # fix foreign word tag
- text = re.sub(r"<\&frasl>\s*<p[^>]*>", "FRASL", text)
- # remove <&I .>
- text = re.sub(r"<\&I[^>]*>", "", text)
- # fix <{word}>
- text = re.sub(r"<{([^}]+)}>", r"\1", text)
- # remove <@>, <p>, </p>
- text = re.sub(r"<(@|/?p)>", r"", text)
- # remove <&M .> and <&T .> and <&Ms .>
- text = re.sub(r"<&\w+ \.>", r"", text)
- # remove <!DOCTYPE... > lines
- text = re.sub(r"<!DOCTYPE[^>]*>", r"", text)
- # remove <[hi]> and <[/p]> etc
- text = re.sub(r"<\[\/?[^>]+\]*>", r"", text)
- # take the thing out of the brackets: <…>
- text = re.sub(r"<(\&\w+;)>", r"\1", text)
- # and remove the & for those patterns that aren't regular XML
- text = re.sub(r"&(?!amp|gt|lt|apos|quot)", r"", text)
- # fix 'abc <p="foo"/>' style tags - now <wf pos="foo">abc</wf>
- text = re.sub(
- r'[ \t]*([^<>\s]+?)[ \t]*<p="([^"]*"?)"/>', r' <wf pos="\2">\1</wf>', text
- )
- text = re.sub(r'\s*"\s*<p=\'"\'/>', " <wf pos='\"'>\"</wf>", text)
- return text
|