# Source: gbrault/jupytersketcher
# (mirror of https://github.com/gbrault/jupytersketcher.git)


			
# Natural Language Toolkit: Penn Treebank Reader
#
# Copyright (C) 2001-2020 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
#         Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
"""
Corpus reader for corpora that consist of parenthesis-delineated parse trees.
"""

import sys

from nltk.tree import Tree
from nltk.tag import map_tag

from nltk.corpus.reader.util import *
from nltk.corpus.reader.api import *

# Patterns used to pull leaf information out of bracketed parse strings.
# we use [^\s()]+ instead of \S+? to avoid matching ()
# "(N tag word)": a leaf carrying a leading numeric field; captures all three.
SORTTAGWRD = re.compile(r"\((\d+) ([^\s()]+) ([^\s()]+)\)")
# "(tag word)": captures the tag and the word.
TAGWORD = re.compile(r"\(([^\s()]+) ([^\s()]+)\)")
# "(tag word)": captures only the word.
WORD = re.compile(r"\([^\s()]+ ([^\s()]+)\)")
# Two opening parentheses separated (and preceded) only by optional whitespace.
EMPTY_BRACKETS = re.compile(r"\s*\(\s*\(")


class BracketParseCorpusReader(SyntaxCorpusReader):
    """
    Reader for corpora that consist of parenthesis-delineated parse trees,
    like those found in the "combined" section of the Penn Treebank,
    e.g. "(S (NP (DT the) (JJ little) (NN dog)) (VP (VBD barked)))".
    """

    def __init__(
        self,
        root,
        fileids,
        comment_char=None,
        detect_blocks="unindented_paren",
        encoding="utf8",
        tagset=None,
    ):
        """
        :param root: The root directory for this corpus.
        :param fileids: A list or regexp specifying the fileids in this corpus.
        :param comment_char: The character which can appear at the start of
            a line to indicate that the rest of the line is a comment.
        :param detect_blocks: The method that is used to find blocks
          in the corpus; can be 'unindented_paren' (every unindented
          parenthesis starts a new parse), 'sexpr' (brackets are
          matched) or 'blankline' (blocks are read with
          ``read_blankline_block``).
        :param encoding: The encoding handed to the ``CorpusReader``
            constructor.
        :param tagset: The name of the tagset used by this corpus, to be used
              for normalizing or converting the POS tags returned by the
              tagged_...() methods.
        """
        # FIXME: Why is it inheriting from SyntaxCorpusReader but initializing
        #       from CorpusReader?
        CorpusReader.__init__(self, root, fileids, encoding)
        self._comment_char = comment_char
        self._detect_blocks = detect_blocks
        self._tagset = tagset

    def _read_block(self, stream):
        """
        Read the next block of raw parse strings from ``stream`` using the
        strategy selected by ``detect_blocks``.

        :param stream: The corpus stream to read from.
        :return: Whatever the selected block reader returns; for
            'unindented_paren' this is the list of tokens with comment
            lines blanked out.
        :raise AssertionError: If ``detect_blocks`` is not a known mode.
        """
        if self._detect_blocks == "sexpr":
            return read_sexpr_block(stream, comment_char=self._comment_char)
        elif self._detect_blocks == "blankline":
            return read_blankline_block(stream)
        elif self._detect_blocks == "unindented_paren":
            # Tokens start with unindented left parens.
            toks = read_regexp_block(stream, start_re=r"^\(")
            # Strip any comments out of the tokens.
            if self._comment_char:
                comment_re = "(?m)^%s.*" % re.escape(self._comment_char)
                toks = [re.sub(comment_re, "", tok) for tok in toks]
            return toks
        else:
            # Raised explicitly (instead of ``assert 0``) so the check still
            # fires when Python runs with assertions disabled (-O).
            raise AssertionError("bad block type")

    def _normalize(self, t):
        """
        Rewrite irregular leaves in the bracketed string ``t`` into the
        two-part ``(tag word)`` form.

        :param t: A bracketed parse string.
        :return: The normalized string.
        """
        # Replace leaves of the form (!), (,), with (! !), (, ,)
        t = re.sub(r"\((.)\)", r"(\1 \1)", t)
        # Replace leaves of the form (tag word root) with (tag word)
        t = re.sub(r"\(([^\s()]+) ([^\s()]+) [^\s()]+\)", r"(\1 \2)", t)
        return t

    def _parse(self, t):
        """
        Parse the bracketed string ``t`` into a ``Tree``.

        If the string cannot be parsed, a message is written to stderr and a
        recovery is attempted: first by appending up to four closing
        parentheses (only for a "mismatched parens" error), then by falling
        back to a flat ``S`` tree built from the tagged leaves.

        :param t: A bracketed parse string.
        :return: The parsed tree, with an empty-labelled single-child root
            stripped off when the string parsed cleanly.
        """
        try:
            tree = Tree.fromstring(self._normalize(t))
        except ValueError as e:
            sys.stderr.write("Bad tree detected; trying to recover...\n")
            # Try to recover, if we can:
            # NOTE(review): this depends on the exact exception message
            # produced by Tree.fromstring -- confirm it is still emitted.
            if e.args == ("mismatched parens",):
                for n in range(1, 5):
                    try:
                        # Tree() expects a node and a child list; a bracketed
                        # string has to go through Tree.fromstring().
                        v = Tree.fromstring(self._normalize(t + ")" * n))
                    except ValueError:
                        continue
                    sys.stderr.write(
                        "  Recovered by adding %d close paren(s)\n" % n
                    )
                    return v
            # Try something else:
            sys.stderr.write("  Recovered by returning a flat parse.\n")
            return Tree("S", self._tag(t))

        # If there's an empty node at the top, strip it off
        if tree.label() == "" and len(tree) == 1:
            return tree[0]
        return tree

    def _tag(self, t, tagset=None):
        """
        Extract the ``(word, tag)`` pairs from the bracketed string ``t``.

        :param t: A bracketed parse string.
        :param tagset: Optional target tagset; when given and different from
            the corpus tagset, every tag is converted with ``map_tag``.
        :return: A list of ``(word, tag)`` tuples.
        """
        tagged_sent = [(w, p) for (p, w) in TAGWORD.findall(self._normalize(t))]
        if tagset and tagset != self._tagset:
            tagged_sent = [
                (w, map_tag(self._tagset, tagset, p)) for (w, p) in tagged_sent
            ]
        return tagged_sent

    def _word(self, t):
        """
        Extract the words from the bracketed string ``t``.

        :param t: A bracketed parse string.
        :return: A list of the word parts of all ``(tag word)`` leaves.
        """
        return WORD.findall(self._normalize(t))


class CategorizedBracketParseCorpusReader(
    CategorizedCorpusReader, BracketParseCorpusReader
):
    """
    Bracket-parse corpus reader whose documents are grouped into
    categories derived from their file identifiers.
    @author: Nathan Schneider <nschneid@cs.cmu.edu>
    """

    def __init__(self, *args, **kwargs):
        """
        Set up both halves of the reader.

        The keyword dictionary itself (not unpacked) is handed to the
        CategorizedCorpusReader constructor, which is where the
        categorization arguments (cat_pattern, cat_map and cat_file) are
        consumed.  The positional arguments and the keyword arguments are
        then forwarded to the BracketParseCorpusReader constructor.
        """
        CategorizedCorpusReader.__init__(self, kwargs)
        BracketParseCorpusReader.__init__(self, *args, **kwargs)

    def _resolve(self, fileids, categories):
        """
        Turn a (fileids, categories) selection into a fileid selection.

        :raise ValueError: If both fileids and categories are given.
        """
        if categories is None:
            return fileids
        if fileids is not None:
            raise ValueError("Specify fileids or categories, not both")
        return self.fileids(categories)

    # Each accessor below narrows the selection with _resolve() and then
    # delegates to the BracketParseCorpusReader method of the same name.

    def raw(self, fileids=None, categories=None):
        selected = self._resolve(fileids, categories)
        return BracketParseCorpusReader.raw(self, selected)

    def words(self, fileids=None, categories=None):
        selected = self._resolve(fileids, categories)
        return BracketParseCorpusReader.words(self, selected)

    def sents(self, fileids=None, categories=None):
        selected = self._resolve(fileids, categories)
        return BracketParseCorpusReader.sents(self, selected)

    def paras(self, fileids=None, categories=None):
        selected = self._resolve(fileids, categories)
        return BracketParseCorpusReader.paras(self, selected)

    def tagged_words(self, fileids=None, categories=None, tagset=None):
        selected = self._resolve(fileids, categories)
        return BracketParseCorpusReader.tagged_words(self, selected, tagset)

    def tagged_sents(self, fileids=None, categories=None, tagset=None):
        selected = self._resolve(fileids, categories)
        return BracketParseCorpusReader.tagged_sents(self, selected, tagset)

    def tagged_paras(self, fileids=None, categories=None, tagset=None):
        selected = self._resolve(fileids, categories)
        return BracketParseCorpusReader.tagged_paras(self, selected, tagset)

    def parsed_words(self, fileids=None, categories=None):
        selected = self._resolve(fileids, categories)
        return BracketParseCorpusReader.parsed_words(self, selected)

    def parsed_sents(self, fileids=None, categories=None):
        selected = self._resolve(fileids, categories)
        return BracketParseCorpusReader.parsed_sents(self, selected)

    def parsed_paras(self, fileids=None, categories=None):
        selected = self._resolve(fileids, categories)
        return BracketParseCorpusReader.parsed_paras(self, selected)


class AlpinoCorpusReader(BracketParseCorpusReader):
    """
    Reader for the Alpino Dutch Treebank.

    The corpus embeds a lexical breakdown structure in its XML, which is
    what _parse reads.  That structure can put punctuation and some other
    words out of sentence order in the XML element tree, which is no good
    for _tag and _word.  Those two methods are therefore overridden to use
    the extra, non-default 'ordered' parameter of the overridden _normalize
    method, so that _parse can remain untouched.
    """

    def __init__(self, root, encoding="ISO-8859-1", tagset=None):
        """
        :param root: The root directory for this corpus.
        :param encoding: The encoding of the corpus file.
        :param tagset: The name of the tagset used by this corpus.
        """
        BracketParseCorpusReader.__init__(
            self,
            root,
            # Raw string: "\." is not a valid escape in a normal literal.
            r"alpino\.xml",
            detect_blocks="blankline",
            encoding=encoding,
            tagset=tagset,
        )

    def _normalize(self, t, ordered=False):
        """Normalize the xml sentence element in t.

        The sentence elements <alpino_ds>, although embedded in a few overall
        xml elements, are separated by blank lines. That's how the reader can
        deliver them one at a time.
        Each sentence has a few category subnodes that are of no use to us.
        The remaining word nodes may or may not appear in the proper order.
        Each word node has attributes, among which:
        - begin : the position of the word in the sentence
        - pos   : Part of Speech: the Tag
        - word  : the actual word
        The return value is a string with all xml elements replaced by
        clauses: either a cat clause with nested clauses, or a word clause.
        The order of the bracket clauses closely follows the xml.
        If ordered == True, the word clauses include an order sequence number.
        If ordered == False, the word clauses only have pos and word parts.

        Blocks that do not start with "<alpino_ds" yield an empty string.
        """
        if not t.startswith("<alpino_ds"):
            return ""
        # convert XML to sexpr notation
        t = re.sub(r'  <node .*? cat="(\w+)".*>', r"(\1", t)
        if ordered:
            # Same shape as the unordered pattern below, with the "begin"
            # attribute captured as a leading sequence number.
            t = re.sub(
                r'  <node .*?begin="(\d+)".*? pos="(\w+)".*? word="([^"]+)".*?/>',
                r"(\1 \2 \3)",
                t,
            )
        else:
            t = re.sub(r'  <node .*?pos="(\w+)".*? word="([^"]+)".*?/>', r"(\1 \2)", t)
        t = re.sub(r"  </node>", r")", t)
        t = re.sub(r"<sentence>.*</sentence>", r"", t)
        t = re.sub(r"</?alpino_ds.*>", r"", t)
        return t

    def _tag(self, t, tagset=None):
        """
        Return the (word, tag) pairs of the sentence in t, in sentence order.

        :param t: The xml sentence element.
        :param tagset: Optional target tagset; when given and different from
            the corpus tagset, every tag is converted with map_tag.
        :return: A list of (word, tag) tuples sorted by sequence number.
        """
        tagged_sent = [
            (int(o), w, p)
            for (o, p, w) in SORTTAGWRD.findall(self._normalize(t, ordered=True))
        ]
        # Tuples lead with the integer sequence number, so a plain sort
        # restores sentence order.
        tagged_sent.sort()
        if tagset and tagset != self._tagset:
            tagged_sent = [
                (w, map_tag(self._tagset, tagset, p)) for (o, w, p) in tagged_sent
            ]
        else:
            tagged_sent = [(w, p) for (o, w, p) in tagged_sent]
        return tagged_sent

    def _word(self, t):
        """Return a correctly ordered list of words"""
        tagged_sent = self._tag(t)
        return [w for (w, p) in tagged_sent]