| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488 |
- # -*- coding: utf-8 -*-
- # Natural Language Toolkit: Interface to the Stanford Parser
- #
- # Copyright (C) 2001-2020 NLTK Project
- # Author: Steven Xu <xxu@student.unimelb.edu.au>
- #
- # URL: <http://nltk.org/>
- # For license information, see LICENSE.TXT
- import tempfile
- import os
- import warnings
- from unittest import skip
- from subprocess import PIPE
- from nltk.internals import (
- find_jar_iter,
- config_java,
- java,
- _java_options,
- find_jars_within_path,
- )
- from nltk.parse.api import ParserI
- from nltk.parse.dependencygraph import DependencyGraph
- from nltk.tree import Tree
- _stanford_url = "https://nlp.stanford.edu/software/lex-parser.shtml"
class GenericStanfordParser(ParserI):
    """Interface to the Stanford Parser.

    Locates the parser code jar and model jar at construction time, then
    shells out to the java class named by ``_MAIN_CLASS`` to parse, turning
    the textual output back into NLTK objects via the subclass-supplied
    ``_make_tree``.
    """

    # Patterns used to locate the code and model jars on disk.
    _MODEL_JAR_PATTERN = r"stanford-parser-(\d+)(\.(\d+))+-models\.jar"
    _JAR = r"stanford-parser\.jar"
    _MAIN_CLASS = "edu.stanford.nlp.parser.lexparser.LexicalizedParser"

    # Subclass knobs: feed input on stdin instead of a file argument, and
    # whether the tool separates parses with *two* blank lines.
    _USE_STDIN = False
    _DOUBLE_SPACED_OUTPUT = False

    def __init__(
        self,
        path_to_jar=None,
        path_to_models_jar=None,
        model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz",
        encoding="utf8",
        verbose=False,
        java_options="-mx4g",
        corenlp_options="",
    ):
        """
        :param path_to_jar: explicit path to the parser jar (else searched
            via the STANFORD_PARSER / STANFORD_CORENLP environment variables)
        :param path_to_models_jar: explicit path to the models jar (else
            searched via STANFORD_MODELS / STANFORD_CORENLP)
        :param model_path: classpath-relative path of the serialized model
        :param encoding: encoding used for the java subprocess I/O
        :param java_options: JVM options, e.g. heap size
        :param corenlp_options: extra command-line options passed through to
            the java main class
        """
        # Find the most recent code and model jar.  find_jar_iter may yield
        # several candidates; taking the max of the containing directory
        # name picks the lexicographically-latest versioned directory.
        stanford_jar = max(
            find_jar_iter(
                self._JAR,
                path_to_jar,
                env_vars=("STANFORD_PARSER", "STANFORD_CORENLP"),
                searchpath=(),
                url=_stanford_url,
                verbose=verbose,
                is_regex=True,
            ),
            key=lambda model_path: os.path.dirname(model_path),
        )

        model_jar = max(
            find_jar_iter(
                self._MODEL_JAR_PATTERN,
                path_to_models_jar,
                env_vars=("STANFORD_MODELS", "STANFORD_CORENLP"),
                searchpath=(),
                url=_stanford_url,
                verbose=verbose,
                is_regex=True,
            ),
            key=lambda model_path: os.path.dirname(model_path),
        )

        # Classpath = models jar plus every jar sitting next to the code jar
        # (this pulls in the logging jars the parser needs at runtime).
        stanford_dir = os.path.split(stanford_jar)[0]
        self._classpath = tuple([model_jar] + find_jars_within_path(stanford_dir))

        self.model_path = model_path
        self._encoding = encoding
        self.corenlp_options = corenlp_options
        self.java_options = java_options

    def _parse_trees_output(self, output_):
        """Split the raw parser output into an iterator of iterators of trees.

        Parses for one input sentence are separated by a blank line (or, when
        ``_DOUBLE_SPACED_OUTPUT`` is set, individual parses by one blank line
        and sentences by two).
        """
        res = []
        cur_lines = []
        cur_trees = []
        blank = False  # True iff the previous line was blank
        for line in output_.splitlines(False):
            if line == "":
                if blank:
                    # Second consecutive blank line: sentence boundary.
                    res.append(iter(cur_trees))
                    cur_trees = []
                    blank = False
                elif self._DOUBLE_SPACED_OUTPUT:
                    # First blank line: end of one parse for this sentence.
                    cur_trees.append(self._make_tree("\n".join(cur_lines)))
                    cur_lines = []
                    blank = True
                else:
                    # Single-spaced output: blank line ends the sentence.
                    res.append(iter([self._make_tree("\n".join(cur_lines))]))
                    cur_lines = []
            else:
                cur_lines.append(line)
                blank = False
        return iter(res)

    def parse_sents(self, sentences, verbose=False):
        """
        Use StanfordParser to parse multiple sentences. Takes multiple sentences as a
        list where each sentence is a list of words.
        Each sentence will be automatically tagged with this StanfordParser instance's
        tagger.
        If whitespaces exists inside a token, then the token will be treated as
        separate tokens.

        :param sentences: Input sentences to parse
        :type sentences: list(list(str))
        :rtype: iter(iter(Tree))
        """
        cmd = [
            self._MAIN_CLASS,
            "-model",
            self.model_path,
            "-sentences",
            "newline",
            "-outputFormat",
            self._OUTPUT_FORMAT,
            "-tokenized",
            "-escaper",
            "edu.stanford.nlp.process.PTBEscapingProcessor",
        ]
        return self._parse_trees_output(
            self._execute(
                cmd, "\n".join(" ".join(sentence) for sentence in sentences), verbose
            )
        )

    def raw_parse(self, sentence, verbose=False):
        """
        Use StanfordParser to parse a sentence. Takes a sentence as a string;
        before parsing, it will be automatically tokenized and tagged by
        the Stanford Parser.

        :param sentence: Input sentence to parse
        :type sentence: str
        :rtype: iter(Tree)
        """
        return next(self.raw_parse_sents([sentence], verbose))

    def raw_parse_sents(self, sentences, verbose=False):
        """
        Use StanfordParser to parse multiple sentences. Takes multiple sentences as a
        list of strings.
        Each sentence will be automatically tokenized and tagged by the Stanford Parser.

        :param sentences: Input sentences to parse
        :type sentences: list(str)
        :rtype: iter(iter(Tree))
        """
        cmd = [
            self._MAIN_CLASS,
            "-model",
            self.model_path,
            "-sentences",
            "newline",
            "-outputFormat",
            self._OUTPUT_FORMAT,
        ]
        return self._parse_trees_output(
            self._execute(cmd, "\n".join(sentences), verbose)
        )

    def tagged_parse(self, sentence, verbose=False):
        """
        Use StanfordParser to parse a sentence. Takes a sentence as a list of
        (word, tag) tuples; the sentence must have already been tokenized and
        tagged.

        :param sentence: Input sentence to parse
        :type sentence: list(tuple(str, str))
        :rtype: iter(Tree)
        """
        return next(self.tagged_parse_sents([sentence], verbose))

    def tagged_parse_sents(self, sentences, verbose=False):
        """
        Use StanfordParser to parse multiple sentences. Takes multiple sentences
        where each sentence is a list of (word, tag) tuples.
        The sentences must have already been tokenized and tagged.

        :param sentences: Input sentences to parse
        :type sentences: list(list(tuple(str, str)))
        :rtype: iter(iter(Tree))
        """
        tag_separator = "/"
        cmd = [
            self._MAIN_CLASS,
            "-model",
            self.model_path,
            "-sentences",
            "newline",
            "-outputFormat",
            self._OUTPUT_FORMAT,
            "-tokenized",
            "-tagSeparator",
            tag_separator,
            "-tokenizerFactory",
            "edu.stanford.nlp.process.WhitespaceTokenizer",
            "-tokenizerMethod",
            "newCoreLabelTokenizerFactory",
        ]
        # We don't need to escape slashes as "splitting is done on the last instance of the character in the token"
        return self._parse_trees_output(
            self._execute(
                cmd,
                "\n".join(
                    " ".join(tag_separator.join(tagged) for tagged in sentence)
                    for sentence in sentences
                ),
                verbose,
            )
        )

    def _execute(self, cmd, input_, verbose=False):
        """Run ``cmd`` through ``java`` on ``input_`` and return decoded stdout.

        ``input_`` is written to a temporary file, which is either piped to
        stdin or appended to the command line depending on ``_USE_STDIN``.
        """
        encoding = self._encoding
        cmd.extend(["-encoding", encoding])
        if self.corenlp_options:
            # BUGFIX: split on whitespace so a multi-token option string
            # (e.g. "-annotators tokenize,ssplit,pos,depparse") reaches java
            # as separate argv entries instead of one malformed argument.
            cmd.extend(self.corenlp_options.split())

        default_options = " ".join(_java_options)

        # Configure java.
        config_java(options=self.java_options, verbose=verbose)

        # Windows is incompatible with NamedTemporaryFile() without passing
        # in delete=False.
        with tempfile.NamedTemporaryFile(mode="wb", delete=False) as input_file:
            try:
                # Write the actual sentences to the temporary input file.
                if isinstance(input_, str) and encoding:
                    input_ = input_.encode(encoding)
                input_file.write(input_)
                input_file.flush()

                # Run the tagger and get the output.
                if self._USE_STDIN:
                    input_file.seek(0)
                    stdout, stderr = java(
                        cmd,
                        classpath=self._classpath,
                        stdin=input_file,
                        stdout=PIPE,
                        stderr=PIPE,
                    )
                else:
                    cmd.append(input_file.name)
                    stdout, stderr = java(
                        cmd, classpath=self._classpath, stdout=PIPE, stderr=PIPE
                    )

                # Normalize non-breaking spaces before decoding.
                stdout = stdout.replace(b"\xc2\xa0", b" ")
                stdout = stdout.replace(b"\x00\xa0", b" ")
                stdout = stdout.decode(encoding)
            finally:
                # BUGFIX: restore the default java configuration even if the
                # subprocess raised, so a failed parse does not leak the
                # changed JVM options into later java() calls.
                config_java(options=default_options, verbose=False)

        # Unlink after the `with` has closed the file (required on Windows,
        # where an open file cannot be removed).
        os.unlink(input_file.name)

        return stdout
class StanfordParser(GenericStanfordParser):
    """Phrase-structure parser producing Penn-bracketed ``Tree`` objects.

    >>> parser=StanfordParser(
    ...     model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz"
    ... )

    >>> list(parser.raw_parse("the quick brown fox jumps over the lazy dog")) # doctest: +NORMALIZE_WHITESPACE
    [Tree('ROOT', [Tree('NP', [Tree('NP', [Tree('DT', ['the']), Tree('JJ', ['quick']), Tree('JJ', ['brown']),
    Tree('NN', ['fox'])]), Tree('NP', [Tree('NP', [Tree('NNS', ['jumps'])]), Tree('PP', [Tree('IN', ['over']),
    Tree('NP', [Tree('DT', ['the']), Tree('JJ', ['lazy']), Tree('NN', ['dog'])])])])])])]

    >>> sum([list(dep_graphs) for dep_graphs in parser.raw_parse_sents((
    ...     "the quick brown fox jumps over the lazy dog",
    ...     "the quick grey wolf jumps over the lazy fox"
    ... ))], []) # doctest: +NORMALIZE_WHITESPACE
    [Tree('ROOT', [Tree('NP', [Tree('NP', [Tree('DT', ['the']), Tree('JJ', ['quick']), Tree('JJ', ['brown']),
    Tree('NN', ['fox'])]), Tree('NP', [Tree('NP', [Tree('NNS', ['jumps'])]), Tree('PP', [Tree('IN', ['over']),
    Tree('NP', [Tree('DT', ['the']), Tree('JJ', ['lazy']), Tree('NN', ['dog'])])])])])]), Tree('ROOT', [Tree('NP',
    [Tree('NP', [Tree('DT', ['the']), Tree('JJ', ['quick']), Tree('JJ', ['grey']), Tree('NN', ['wolf'])]), Tree('NP',
    [Tree('NP', [Tree('NNS', ['jumps'])]), Tree('PP', [Tree('IN', ['over']), Tree('NP', [Tree('DT', ['the']),
    Tree('JJ', ['lazy']), Tree('NN', ['fox'])])])])])])]

    >>> sum([list(dep_graphs) for dep_graphs in parser.parse_sents((
    ...     "I 'm a dog".split(),
    ...     "This is my friends ' cat ( the tabby )".split(),
    ... ))], []) # doctest: +NORMALIZE_WHITESPACE
    [Tree('ROOT', [Tree('S', [Tree('NP', [Tree('PRP', ['I'])]), Tree('VP', [Tree('VBP', ["'m"]),
    Tree('NP', [Tree('DT', ['a']), Tree('NN', ['dog'])])])])]), Tree('ROOT', [Tree('S', [Tree('NP',
    [Tree('DT', ['This'])]), Tree('VP', [Tree('VBZ', ['is']), Tree('NP', [Tree('NP', [Tree('NP', [Tree('PRP$', ['my']),
    Tree('NNS', ['friends']), Tree('POS', ["'"])]), Tree('NN', ['cat'])]), Tree('PRN', [Tree('-LRB-', [Tree('', []),
    Tree('NP', [Tree('DT', ['the']), Tree('NN', ['tabby'])]), Tree('-RRB-', [])])])])])])])]

    >>> sum([list(dep_graphs) for dep_graphs in parser.tagged_parse_sents((
    ...     (
    ...         ("The", "DT"),
    ...         ("quick", "JJ"),
    ...         ("brown", "JJ"),
    ...         ("fox", "NN"),
    ...         ("jumped", "VBD"),
    ...         ("over", "IN"),
    ...         ("the", "DT"),
    ...         ("lazy", "JJ"),
    ...         ("dog", "NN"),
    ...         (".", "."),
    ...     ),
    ... ))],[]) # doctest: +NORMALIZE_WHITESPACE
    [Tree('ROOT', [Tree('S', [Tree('NP', [Tree('DT', ['The']), Tree('JJ', ['quick']), Tree('JJ', ['brown']),
    Tree('NN', ['fox'])]), Tree('VP', [Tree('VBD', ['jumped']), Tree('PP', [Tree('IN', ['over']), Tree('NP',
    [Tree('DT', ['the']), Tree('JJ', ['lazy']), Tree('NN', ['dog'])])])]), Tree('.', ['.'])])])]
    """

    # Ask the java tool for Penn-treebank bracketed output.
    _OUTPUT_FORMAT = "penn"

    def __init__(self, *args, **kwargs):
        # This whole module is deprecated in favour of the CoreNLP web API.
        message = (
            "The StanfordParser will be deprecated\n"
            "Please use \033[91mnltk.parse.corenlp.CoreNLPParser\033[0m instead."
        )
        warnings.warn(message, DeprecationWarning, stacklevel=2)
        super().__init__(*args, **kwargs)

    def _make_tree(self, result):
        # Penn-bracketed text parses directly into an nltk Tree.
        return Tree.fromstring(result)
class StanfordDependencyParser(GenericStanfordParser):
    """Dependency parser producing ``DependencyGraph`` objects from CoNLL output.

    >>> dep_parser=StanfordDependencyParser(
    ...     model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz"
    ... )

    >>> [parse.tree() for parse in dep_parser.raw_parse("The quick brown fox jumps over the lazy dog.")] # doctest: +NORMALIZE_WHITESPACE
    [Tree('jumps', [Tree('fox', ['The', 'quick', 'brown']), Tree('dog', ['over', 'the', 'lazy'])])]

    >>> [list(parse.triples()) for parse in dep_parser.raw_parse("The quick brown fox jumps over the lazy dog.")] # doctest: +NORMALIZE_WHITESPACE
    [[((u'jumps', u'VBZ'), u'nsubj', (u'fox', u'NN')), ((u'fox', u'NN'), u'det', (u'The', u'DT')),
    ((u'fox', u'NN'), u'amod', (u'quick', u'JJ')), ((u'fox', u'NN'), u'amod', (u'brown', u'JJ')),
    ((u'jumps', u'VBZ'), u'nmod', (u'dog', u'NN')), ((u'dog', u'NN'), u'case', (u'over', u'IN')),
    ((u'dog', u'NN'), u'det', (u'the', u'DT')), ((u'dog', u'NN'), u'amod', (u'lazy', u'JJ'))]]

    >>> sum([[parse.tree() for parse in dep_graphs] for dep_graphs in dep_parser.raw_parse_sents((
    ...     "The quick brown fox jumps over the lazy dog.",
    ...     "The quick grey wolf jumps over the lazy fox."
    ... ))], []) # doctest: +NORMALIZE_WHITESPACE
    [Tree('jumps', [Tree('fox', ['The', 'quick', 'brown']), Tree('dog', ['over', 'the', 'lazy'])]),
    Tree('jumps', [Tree('wolf', ['The', 'quick', 'grey']), Tree('fox', ['over', 'the', 'lazy'])])]

    >>> sum([[parse.tree() for parse in dep_graphs] for dep_graphs in dep_parser.parse_sents((
    ...     "I 'm a dog".split(),
    ...     "This is my friends ' cat ( the tabby )".split(),
    ... ))], []) # doctest: +NORMALIZE_WHITESPACE
    [Tree('dog', ['I', "'m", 'a']), Tree('cat', ['This', 'is', Tree('friends', ['my', "'"]), Tree('tabby', ['the'])])]

    >>> sum([[list(parse.triples()) for parse in dep_graphs] for dep_graphs in dep_parser.tagged_parse_sents((
    ...     (
    ...         ("The", "DT"),
    ...         ("quick", "JJ"),
    ...         ("brown", "JJ"),
    ...         ("fox", "NN"),
    ...         ("jumped", "VBD"),
    ...         ("over", "IN"),
    ...         ("the", "DT"),
    ...         ("lazy", "JJ"),
    ...         ("dog", "NN"),
    ...         (".", "."),
    ...     ),
    ... ))],[]) # doctest: +NORMALIZE_WHITESPACE
    [[((u'jumped', u'VBD'), u'nsubj', (u'fox', u'NN')), ((u'fox', u'NN'), u'det', (u'The', u'DT')),
    ((u'fox', u'NN'), u'amod', (u'quick', u'JJ')), ((u'fox', u'NN'), u'amod', (u'brown', u'JJ')),
    ((u'jumped', u'VBD'), u'nmod', (u'dog', u'NN')), ((u'dog', u'NN'), u'case', (u'over', u'IN')),
    ((u'dog', u'NN'), u'det', (u'the', u'DT')), ((u'dog', u'NN'), u'amod', (u'lazy', u'JJ'))]]
    """

    # Ask the java tool for CoNLL-2007 dependency output.
    _OUTPUT_FORMAT = "conll2007"

    def __init__(self, *args, **kwargs):
        # This whole module is deprecated in favour of the CoreNLP web API.
        message = (
            "The StanfordDependencyParser will be deprecated\n"
            "Please use \033[91mnltk.parse.corenlp.CoreNLPDependencyParser\033[0m instead."
        )
        warnings.warn(message, DeprecationWarning, stacklevel=2)
        super().__init__(*args, **kwargs)

    def _make_tree(self, result):
        # CoNLL output converts into a DependencyGraph; this tool labels the
        # top relation "root" (lowercase).
        return DependencyGraph(result, top_relation_label="root")
class StanfordNeuralDependencyParser(GenericStanfordParser):
    """Neural-network dependency parser run via the CoreNLP pipeline jar.

    >>> from nltk.parse.stanford import StanfordNeuralDependencyParser
    >>> dep_parser=StanfordNeuralDependencyParser(java_options='-mx4g')

    >>> [parse.tree() for parse in dep_parser.raw_parse("The quick brown fox jumps over the lazy dog.")] # doctest: +NORMALIZE_WHITESPACE
    [Tree('jumps', [Tree('fox', ['The', 'quick', 'brown']), Tree('dog', ['over', 'the', 'lazy']), '.'])]

    >>> [list(parse.triples()) for parse in dep_parser.raw_parse("The quick brown fox jumps over the lazy dog.")] # doctest: +NORMALIZE_WHITESPACE
    [[((u'jumps', u'VBZ'), u'nsubj', (u'fox', u'NN')), ((u'fox', u'NN'), u'det',
    (u'The', u'DT')), ((u'fox', u'NN'), u'amod', (u'quick', u'JJ')), ((u'fox', u'NN'),
    u'amod', (u'brown', u'JJ')), ((u'jumps', u'VBZ'), u'nmod', (u'dog', u'NN')),
    ((u'dog', u'NN'), u'case', (u'over', u'IN')), ((u'dog', u'NN'), u'det',
    (u'the', u'DT')), ((u'dog', u'NN'), u'amod', (u'lazy', u'JJ')), ((u'jumps', u'VBZ'),
    u'punct', (u'.', u'.'))]]

    >>> sum([[parse.tree() for parse in dep_graphs] for dep_graphs in dep_parser.raw_parse_sents((
    ...     "The quick brown fox jumps over the lazy dog.",
    ...     "The quick grey wolf jumps over the lazy fox."
    ... ))], []) # doctest: +NORMALIZE_WHITESPACE
    [Tree('jumps', [Tree('fox', ['The', 'quick', 'brown']), Tree('dog', ['over',
    'the', 'lazy']), '.']), Tree('jumps', [Tree('wolf', ['The', 'quick', 'grey']),
    Tree('fox', ['over', 'the', 'lazy']), '.'])]

    >>> sum([[parse.tree() for parse in dep_graphs] for dep_graphs in dep_parser.parse_sents((
    ...     "I 'm a dog".split(),
    ...     "This is my friends ' cat ( the tabby )".split(),
    ... ))], []) # doctest: +NORMALIZE_WHITESPACE
    [Tree('dog', ['I', "'m", 'a']), Tree('cat', ['This', 'is', Tree('friends',
    ['my', "'"]), Tree('tabby', ['-LRB-', 'the', '-RRB-'])])]
    """

    # Runs the full CoreNLP pipeline rather than the lexparser main class,
    # feeding input on stdin and reading double-blank-line-separated output.
    _OUTPUT_FORMAT = "conll"
    _MAIN_CLASS = "edu.stanford.nlp.pipeline.StanfordCoreNLP"
    _JAR = r"stanford-corenlp-(\d+)(\.(\d+))+\.jar"
    _MODEL_JAR_PATTERN = r"stanford-corenlp-(\d+)(\.(\d+))+-models\.jar"
    _USE_STDIN = True
    _DOUBLE_SPACED_OUTPUT = True

    def __init__(self, *args, **kwargs):
        warnings.warn(
            "The StanfordNeuralDependencyParser will be deprecated\n"
            "Please use \033[91mnltk.parse.corenlp.CoreNLPDependencyParser\033[0m instead.",
            DeprecationWarning,
            stacklevel=2,
        )
        super().__init__(*args, **kwargs)
        # BUGFIX: the original did `self.corenlp_options += "-annotators ..."`
        # with no separator, so a user-supplied corenlp_options string such as
        # "-foo" became the single fused token "-foo-annotators ...".  Insert
        # a space when options are already present; the default ("") case is
        # byte-identical to the old behavior.
        annotators = "-annotators tokenize,ssplit,pos,depparse"
        if self.corenlp_options:
            self.corenlp_options += " " + annotators
        else:
            self.corenlp_options = annotators

    def tagged_parse_sents(self, sentences, verbose=False):
        """
        Currently unimplemented because the neural dependency parser (and
        the StanfordCoreNLP pipeline class) doesn't support passing in pre-
        tagged tokens.
        """
        raise NotImplementedError(
            "tagged_parse[_sents] is not supported by "
            "StanfordNeuralDependencyParser; use "
            "parse[_sents] or raw_parse[_sents] instead."
        )

    def _make_tree(self, result):
        # The CoreNLP pipeline labels the top relation "ROOT" (uppercase),
        # unlike the lexparser's conll2007 output.
        return DependencyGraph(result, top_relation_label="ROOT")
# The module's doctests are unconditionally skipped (the whole module is
# deprecated); the body below would additionally skip them when the required
# Stanford jars are not installed.
@skip("doctests from nltk.parse.stanford are skipped because it's deprecated")
def setup_module(module):
    from nose import SkipTest

    try:
        # Instantiating the parsers exercises the jar-lookup machinery; a
        # missing jar surfaces as LookupError.
        StanfordParser(
            model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz"
        )
        StanfordNeuralDependencyParser()
    except LookupError:
        raise SkipTest(
            "doctests from nltk.parse.stanford are skipped because one of the stanford parser or CoreNLP jars doesn't exist"
        )
|