# Natural Language Toolkit: SemCor Corpus Reader
#
# Copyright (C) 2001-2020 NLTK Project
# Author: Nathan Schneider <nschneid@cs.cmu.edu>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT

"""
Corpus reader for the SemCor Corpus.
"""

__docformat__ = "epytext en"

from nltk.corpus.reader.api import *
from nltk.corpus.reader.xmldocs import XMLCorpusReader, XMLCorpusView
from nltk.tree import Tree
class SemcorCorpusReader(XMLCorpusReader):
    """
    Corpus reader for the SemCor Corpus.
    For access to the complete XML data structure, use the ``xml()``
    method. For access to simple word lists and tagged word lists, use
    ``words()``, ``sents()``, ``tagged_words()``, and ``tagged_sents()``.
    """

    def __init__(self, root, fileids, wordnet, lazy=True):
        """
        :param root: The root directory of the corpus.
        :param fileids: A list or regexp specifying the fileids of the corpus.
        :param wordnet: The WordNet corpus reader used to resolve sense keys
            into ``Lemma`` objects.
        :param lazy: If True (default), contents are served through lazy
            stream-backed views rather than read eagerly into memory.
        """
        XMLCorpusReader.__init__(self, root, fileids)
        self._lazy = lazy
        self._wordnet = wordnet

    def words(self, fileids=None):
        """
        :return: the given file(s) as a list of words and punctuation symbols.
        :rtype: list(str)
        """
        return self._items(fileids, "word", False, False, False)

    def chunks(self, fileids=None):
        """
        :return: the given file(s) as a list of chunks,
            each of which is a list of words and punctuation symbols
            that form a unit.
        :rtype: list(list(str))
        """
        return self._items(fileids, "chunk", False, False, False)

    # NOTE: the default was historically written ``tag=("pos" or "sem" or "both")``,
    # an expression that evaluates to just "pos" at definition time; the plain
    # literal below is the same default, stated honestly.
    def tagged_chunks(self, fileids=None, tag="pos"):
        """
        :return: the given file(s) as a list of tagged chunks, represented
            in tree form.
        :rtype: list(Tree)

        :param tag: `'pos'` (part of speech), `'sem'` (semantic), or `'both'`
            to indicate the kind of tags to include.  Semantic tags consist of
            WordNet lemma IDs, plus an `'NE'` node if the chunk is a named entity
            without a specific entry in WordNet.  (Named entities of type 'other'
            have no lemma.  Other chunks not in WordNet have no semantic tag.
            Punctuation tokens have `None` for their part of speech tag.)
        """
        # tag != "sem" -> include POS; tag != "pos" -> include semantic tags,
        # so "both" turns on both kinds.
        return self._items(fileids, "chunk", False, tag != "sem", tag != "pos")

    def sents(self, fileids=None):
        """
        :return: the given file(s) as a list of sentences, each encoded
            as a list of word strings.
        :rtype: list(list(str))
        """
        return self._items(fileids, "word", True, False, False)

    def chunk_sents(self, fileids=None):
        """
        :return: the given file(s) as a list of sentences, each encoded
            as a list of chunks.
        :rtype: list(list(list(str)))
        """
        return self._items(fileids, "chunk", True, False, False)

    # See the note on tagged_chunks() regarding the default value of ``tag``.
    def tagged_sents(self, fileids=None, tag="pos"):
        """
        :return: the given file(s) as a list of sentences. Each sentence
            is represented as a list of tagged chunks (in tree form).
        :rtype: list(list(Tree))

        :param tag: `'pos'` (part of speech), `'sem'` (semantic), or `'both'`
            to indicate the kind of tags to include.  Semantic tags consist of
            WordNet lemma IDs, plus an `'NE'` node if the chunk is a named entity
            without a specific entry in WordNet.  (Named entities of type 'other'
            have no lemma.  Other chunks not in WordNet have no semantic tag.
            Punctuation tokens have `None` for their part of speech tag.)
        """
        return self._items(fileids, "chunk", True, tag != "sem", tag != "pos")

    def _items(self, fileids, unit, bracket_sent, pos_tag, sem_tag):
        """
        Build one view per fileid and concatenate them.

        :param unit: One of `'token'`, `'word'`, or `'chunk'`.
        :param bracket_sent: If true, group results by sentence.
        :param pos_tag: Whether to include part-of-speech tags.
        :param sem_tag: Whether to include semantic tags.
        """
        if unit == "word" and not bracket_sent:
            # the result of the SemcorWordView may be a multiword unit, so the
            # LazyConcatenation will make sure the sentence is flattened
            def make_view(*args):
                return LazyConcatenation(
                    (SemcorWordView if self._lazy else self._words)(*args)
                )

        else:
            make_view = SemcorWordView if self._lazy else self._words
        return concat(
            [
                make_view(fileid, unit, bracket_sent, pos_tag, sem_tag, self._wordnet)
                for fileid in self.abspaths(fileids)
            ]
        )

    def _words(self, fileid, unit, bracket_sent, pos_tag, sem_tag):
        """
        Helper used to implement the view methods -- returns a list of
        tokens, (segmented) words, chunks, or sentences. The tokens
        and chunks may optionally be tagged (with POS and sense
        information).

        :param fileid: The name of the underlying file.
        :param unit: One of `'token'`, `'word'`, or `'chunk'`.
        :param bracket_sent: If true, include sentence bracketing.
        :param pos_tag: Whether to include part-of-speech tags.
        :param sem_tag: Whether to include semantic tags, namely WordNet lemma
            and OOV named entity status.
        """
        assert unit in ("token", "word", "chunk")
        result = []

        xmldoc = ElementTree.parse(fileid).getroot()
        for xmlsent in xmldoc.findall(".//s"):
            sent = []
            for xmlword in _all_xmlwords_in(xmlsent):
                itm = SemcorCorpusReader._word(
                    xmlword, unit, pos_tag, sem_tag, self._wordnet
                )
                if unit == "word":
                    # a multiword token expands to several words
                    sent.extend(itm)
                else:
                    sent.append(itm)

            if bracket_sent:
                result.append(SemcorSentence(xmlsent.attrib["snum"], sent))
            else:
                result.extend(sent)

        assert None not in result
        return result

    @staticmethod
    def _word(xmlword, unit, pos_tag, sem_tag, wordnet):
        """
        Convert one <wf> or <punc> XML element to the requested representation.

        :param xmlword: the XML element for a single token.
        :param unit: One of `'token'`, `'word'`, or `'chunk'`.
        :param pos_tag: Whether to include part-of-speech tags.
        :param sem_tag: Whether to include semantic tags.
        :param wordnet: WordNet corpus reader used to resolve sense keys.
        :return: a string, tuple, list of strings, or ``Tree``, depending on
            *unit* and the tag flags.
        """
        tkn = xmlword.text
        if not tkn:
            tkn = ""  # fixes issue 337?

        lemma = xmlword.get("lemma", tkn)  # lemma or NE class
        lexsn = xmlword.get("lexsn")  # lex_sense (locator for the lemma's sense)
        if lexsn is not None:
            sense_key = lemma + "%" + lexsn
            wnpos = ("n", "v", "a", "r", "s")[
                int(lexsn.split(":")[0]) - 1
            ]  # see http://wordnet.princeton.edu/man/senseidx.5WN.html
        else:
            sense_key = wnpos = None

        # The "rdf" attribute (redefinition) indicates the lookup string does not
        # exactly match the enclosed string, e.g. due to typographical adjustments
        # or discontinuity of a multiword expression.  If a redefinition has
        # occurred, "rdf" holds the inflected form and "lemma" holds its lemma.
        # For NEs, "rdf", "lemma", and "pn" all hold the same value (the NE class).
        # The value itself is not needed here, so it is not read.

        sensenum = xmlword.get("wnsn")  # WordNet sense number
        isOOVEntity = "pn" in xmlword.keys()  # a "personal name" (NE) not in WordNet
        pos = xmlword.get(
            "pos"
        )  # part of speech for the whole chunk (None for punctuation)

        if unit == "token":
            if not pos_tag and not sem_tag:
                itm = tkn
            else:
                itm = (
                    (tkn,)
                    + ((pos,) if pos_tag else ())
                    + ((lemma, wnpos, sensenum, isOOVEntity) if sem_tag else ())
                )
            return itm
        else:
            ww = tkn.split("_")  # TODO: case where punctuation intervenes in MWE
            if unit == "word":
                return ww
            else:
                if sensenum is not None:
                    try:
                        sense = wordnet.lemma_from_key(sense_key)  # Lemma object
                    except Exception:
                        # cannot retrieve the wordnet.Lemma object. possible reasons:
                        #  (a) the wordnet corpus is not downloaded;
                        #  (b) a nonexistant sense is annotated: e.g., such.s.00 triggers:
                        #  nltk.corpus.reader.wordnet.WordNetError:
                        #  No synset found for key u'such%5:00:01:specified:00'
                        # solution: just use the lemma name as a string
                        try:
                            sense = "%s.%s.%02d" % (
                                lemma,
                                wnpos,
                                int(sensenum),
                            )  # e.g.: reach.v.02
                        except ValueError:
                            sense = (
                                lemma + "." + wnpos + "." + sensenum
                            )  # e.g. the sense number may be "2;1"

                bottom = [Tree(pos, ww)] if pos_tag else ww

                if sem_tag and isOOVEntity:
                    if sensenum is not None:
                        return Tree(sense, [Tree("NE", bottom)])
                    else:  # 'other' NE
                        return Tree("NE", bottom)
                elif sem_tag and sensenum is not None:
                    return Tree(sense, bottom)
                elif pos_tag:
                    return bottom[0]
                else:
                    return bottom  # chunk as a list
- def _all_xmlwords_in(elt, result=None):
- if result is None:
- result = []
- for child in elt:
- if child.tag in ("wf", "punc"):
- result.append(child)
- else:
- _all_xmlwords_in(child, result)
- return result
class SemcorSentence(list):
    """
    A list of words, augmented by an attribute ``num`` used to record
    the sentence identifier (the ``n`` attribute from the XML).
    """

    def __init__(self, num, items):
        """
        :param num: the sentence identifier from the source XML.
        :param items: the words/chunks making up the sentence.
        """
        super().__init__(items)
        self.num = num
class SemcorWordView(XMLCorpusView):
    """
    A stream backed corpus view specialized for use with the SemCor corpus.
    """

    def __init__(self, fileid, unit, bracket_sent, pos_tag, sem_tag, wordnet):
        """
        :param fileid: The name of the underlying file.
        :param unit: One of `'token'`, `'word'`, or `'chunk'`.
        :param bracket_sent: If true, include sentence bracketing.
        :param pos_tag: Whether to include part-of-speech tags.
        :param sem_tag: Whether to include semantic tags, namely WordNet lemma
            and OOV named entity status.
        """
        self._unit = unit
        self._sent = bracket_sent
        self._pos_tag = pos_tag
        self._sem_tag = sem_tag
        self._wordnet = wordnet
        # Match whole sentence elements when bracketing; otherwise match the
        # individual word/punctuation elements inside each sentence.
        tagspec = ".*/s" if bracket_sent else ".*/s/(punc|wf)"
        XMLCorpusView.__init__(self, fileid, tagspec)

    def handle_elt(self, elt, context):
        # Dispatch on whether this view yields sentences or single tokens.
        return self.handle_sent(elt) if self._sent else self.handle_word(elt)

    def handle_word(self, elt):
        # Delegate token conversion to the reader's shared static helper.
        return SemcorCorpusReader._word(
            elt, self._unit, self._pos_tag, self._sem_tag, self._wordnet
        )

    def handle_sent(self, elt):
        tokens = []
        for child in elt:
            if child.tag not in ("wf", "punc"):
                raise ValueError("Unexpected element %s" % child.tag)
            itm = self.handle_word(child)
            if self._unit == "word":
                # a multiword token expands into several words
                tokens.extend(itm)
            else:
                tokens.append(itm)
        return SemcorSentence(elt.attrib["snum"], tokens)