  1. # -*- coding: utf-8 -*-
  2. # Natural Language Toolkit: Interface to the Stanford Parser
  3. #
  4. # Copyright (C) 2001-2020 NLTK Project
  5. # Author: Steven Xu <xxu@student.unimelb.edu.au>
  6. #
  7. # URL: <http://nltk.org/>
  8. # For license information, see LICENSE.TXT
  9. import tempfile
  10. import os
  11. import warnings
  12. from unittest import skip
  13. from subprocess import PIPE
  14. from nltk.internals import (
  15. find_jar_iter,
  16. config_java,
  17. java,
  18. _java_options,
  19. find_jars_within_path,
  20. )
  21. from nltk.parse.api import ParserI
  22. from nltk.parse.dependencygraph import DependencyGraph
  23. from nltk.tree import Tree
  24. _stanford_url = "https://nlp.stanford.edu/software/lex-parser.shtml"
  25. class GenericStanfordParser(ParserI):
  26. """Interface to the Stanford Parser"""
  27. _MODEL_JAR_PATTERN = r"stanford-parser-(\d+)(\.(\d+))+-models\.jar"
  28. _JAR = r"stanford-parser\.jar"
  29. _MAIN_CLASS = "edu.stanford.nlp.parser.lexparser.LexicalizedParser"
  30. _USE_STDIN = False
  31. _DOUBLE_SPACED_OUTPUT = False
  32. def __init__(
  33. self,
  34. path_to_jar=None,
  35. path_to_models_jar=None,
  36. model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz",
  37. encoding="utf8",
  38. verbose=False,
  39. java_options="-mx4g",
  40. corenlp_options="",
  41. ):
  42. # find the most recent code and model jar
  43. stanford_jar = max(
  44. find_jar_iter(
  45. self._JAR,
  46. path_to_jar,
  47. env_vars=("STANFORD_PARSER", "STANFORD_CORENLP"),
  48. searchpath=(),
  49. url=_stanford_url,
  50. verbose=verbose,
  51. is_regex=True,
  52. ),
  53. key=lambda model_path: os.path.dirname(model_path),
  54. )
  55. model_jar = max(
  56. find_jar_iter(
  57. self._MODEL_JAR_PATTERN,
  58. path_to_models_jar,
  59. env_vars=("STANFORD_MODELS", "STANFORD_CORENLP"),
  60. searchpath=(),
  61. url=_stanford_url,
  62. verbose=verbose,
  63. is_regex=True,
  64. ),
  65. key=lambda model_path: os.path.dirname(model_path),
  66. )
  67. # self._classpath = (stanford_jar, model_jar)
  68. # Adding logging jar files to classpath
  69. stanford_dir = os.path.split(stanford_jar)[0]
  70. self._classpath = tuple([model_jar] + find_jars_within_path(stanford_dir))
  71. self.model_path = model_path
  72. self._encoding = encoding
  73. self.corenlp_options = corenlp_options
  74. self.java_options = java_options
  75. def _parse_trees_output(self, output_):
  76. res = []
  77. cur_lines = []
  78. cur_trees = []
  79. blank = False
  80. for line in output_.splitlines(False):
  81. if line == "":
  82. if blank:
  83. res.append(iter(cur_trees))
  84. cur_trees = []
  85. blank = False
  86. elif self._DOUBLE_SPACED_OUTPUT:
  87. cur_trees.append(self._make_tree("\n".join(cur_lines)))
  88. cur_lines = []
  89. blank = True
  90. else:
  91. res.append(iter([self._make_tree("\n".join(cur_lines))]))
  92. cur_lines = []
  93. else:
  94. cur_lines.append(line)
  95. blank = False
  96. return iter(res)
  97. def parse_sents(self, sentences, verbose=False):
  98. """
  99. Use StanfordParser to parse multiple sentences. Takes multiple sentences as a
  100. list where each sentence is a list of words.
  101. Each sentence will be automatically tagged with this StanfordParser instance's
  102. tagger.
  103. If whitespaces exists inside a token, then the token will be treated as
  104. separate tokens.
  105. :param sentences: Input sentences to parse
  106. :type sentences: list(list(str))
  107. :rtype: iter(iter(Tree))
  108. """
  109. cmd = [
  110. self._MAIN_CLASS,
  111. "-model",
  112. self.model_path,
  113. "-sentences",
  114. "newline",
  115. "-outputFormat",
  116. self._OUTPUT_FORMAT,
  117. "-tokenized",
  118. "-escaper",
  119. "edu.stanford.nlp.process.PTBEscapingProcessor",
  120. ]
  121. return self._parse_trees_output(
  122. self._execute(
  123. cmd, "\n".join(" ".join(sentence) for sentence in sentences), verbose
  124. )
  125. )
  126. def raw_parse(self, sentence, verbose=False):
  127. """
  128. Use StanfordParser to parse a sentence. Takes a sentence as a string;
  129. before parsing, it will be automatically tokenized and tagged by
  130. the Stanford Parser.
  131. :param sentence: Input sentence to parse
  132. :type sentence: str
  133. :rtype: iter(Tree)
  134. """
  135. return next(self.raw_parse_sents([sentence], verbose))
  136. def raw_parse_sents(self, sentences, verbose=False):
  137. """
  138. Use StanfordParser to parse multiple sentences. Takes multiple sentences as a
  139. list of strings.
  140. Each sentence will be automatically tokenized and tagged by the Stanford Parser.
  141. :param sentences: Input sentences to parse
  142. :type sentences: list(str)
  143. :rtype: iter(iter(Tree))
  144. """
  145. cmd = [
  146. self._MAIN_CLASS,
  147. "-model",
  148. self.model_path,
  149. "-sentences",
  150. "newline",
  151. "-outputFormat",
  152. self._OUTPUT_FORMAT,
  153. ]
  154. return self._parse_trees_output(
  155. self._execute(cmd, "\n".join(sentences), verbose)
  156. )
  157. def tagged_parse(self, sentence, verbose=False):
  158. """
  159. Use StanfordParser to parse a sentence. Takes a sentence as a list of
  160. (word, tag) tuples; the sentence must have already been tokenized and
  161. tagged.
  162. :param sentence: Input sentence to parse
  163. :type sentence: list(tuple(str, str))
  164. :rtype: iter(Tree)
  165. """
  166. return next(self.tagged_parse_sents([sentence], verbose))
  167. def tagged_parse_sents(self, sentences, verbose=False):
  168. """
  169. Use StanfordParser to parse multiple sentences. Takes multiple sentences
  170. where each sentence is a list of (word, tag) tuples.
  171. The sentences must have already been tokenized and tagged.
  172. :param sentences: Input sentences to parse
  173. :type sentences: list(list(tuple(str, str)))
  174. :rtype: iter(iter(Tree))
  175. """
  176. tag_separator = "/"
  177. cmd = [
  178. self._MAIN_CLASS,
  179. "-model",
  180. self.model_path,
  181. "-sentences",
  182. "newline",
  183. "-outputFormat",
  184. self._OUTPUT_FORMAT,
  185. "-tokenized",
  186. "-tagSeparator",
  187. tag_separator,
  188. "-tokenizerFactory",
  189. "edu.stanford.nlp.process.WhitespaceTokenizer",
  190. "-tokenizerMethod",
  191. "newCoreLabelTokenizerFactory",
  192. ]
  193. # We don't need to escape slashes as "splitting is done on the last instance of the character in the token"
  194. return self._parse_trees_output(
  195. self._execute(
  196. cmd,
  197. "\n".join(
  198. " ".join(tag_separator.join(tagged) for tagged in sentence)
  199. for sentence in sentences
  200. ),
  201. verbose,
  202. )
  203. )
  204. def _execute(self, cmd, input_, verbose=False):
  205. encoding = self._encoding
  206. cmd.extend(["-encoding", encoding])
  207. if self.corenlp_options:
  208. cmd.append(self.corenlp_options)
  209. default_options = " ".join(_java_options)
  210. # Configure java.
  211. config_java(options=self.java_options, verbose=verbose)
  212. # Windows is incompatible with NamedTemporaryFile() without passing in delete=False.
  213. with tempfile.NamedTemporaryFile(mode="wb", delete=False) as input_file:
  214. # Write the actual sentences to the temporary input file
  215. if isinstance(input_, str) and encoding:
  216. input_ = input_.encode(encoding)
  217. input_file.write(input_)
  218. input_file.flush()
  219. # Run the tagger and get the output.
  220. if self._USE_STDIN:
  221. input_file.seek(0)
  222. stdout, stderr = java(
  223. cmd,
  224. classpath=self._classpath,
  225. stdin=input_file,
  226. stdout=PIPE,
  227. stderr=PIPE,
  228. )
  229. else:
  230. cmd.append(input_file.name)
  231. stdout, stderr = java(
  232. cmd, classpath=self._classpath, stdout=PIPE, stderr=PIPE
  233. )
  234. stdout = stdout.replace(b"\xc2\xa0", b" ")
  235. stdout = stdout.replace(b"\x00\xa0", b" ")
  236. stdout = stdout.decode(encoding)
  237. os.unlink(input_file.name)
  238. # Return java configurations to their default values.
  239. config_java(options=default_options, verbose=False)
  240. return stdout
  241. class StanfordParser(GenericStanfordParser):
  242. """
  243. >>> parser=StanfordParser(
  244. ... model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz"
  245. ... )
  246. >>> list(parser.raw_parse("the quick brown fox jumps over the lazy dog")) # doctest: +NORMALIZE_WHITESPACE
  247. [Tree('ROOT', [Tree('NP', [Tree('NP', [Tree('DT', ['the']), Tree('JJ', ['quick']), Tree('JJ', ['brown']),
  248. Tree('NN', ['fox'])]), Tree('NP', [Tree('NP', [Tree('NNS', ['jumps'])]), Tree('PP', [Tree('IN', ['over']),
  249. Tree('NP', [Tree('DT', ['the']), Tree('JJ', ['lazy']), Tree('NN', ['dog'])])])])])])]
  250. >>> sum([list(dep_graphs) for dep_graphs in parser.raw_parse_sents((
  251. ... "the quick brown fox jumps over the lazy dog",
  252. ... "the quick grey wolf jumps over the lazy fox"
  253. ... ))], []) # doctest: +NORMALIZE_WHITESPACE
  254. [Tree('ROOT', [Tree('NP', [Tree('NP', [Tree('DT', ['the']), Tree('JJ', ['quick']), Tree('JJ', ['brown']),
  255. Tree('NN', ['fox'])]), Tree('NP', [Tree('NP', [Tree('NNS', ['jumps'])]), Tree('PP', [Tree('IN', ['over']),
  256. Tree('NP', [Tree('DT', ['the']), Tree('JJ', ['lazy']), Tree('NN', ['dog'])])])])])]), Tree('ROOT', [Tree('NP',
  257. [Tree('NP', [Tree('DT', ['the']), Tree('JJ', ['quick']), Tree('JJ', ['grey']), Tree('NN', ['wolf'])]), Tree('NP',
  258. [Tree('NP', [Tree('NNS', ['jumps'])]), Tree('PP', [Tree('IN', ['over']), Tree('NP', [Tree('DT', ['the']),
  259. Tree('JJ', ['lazy']), Tree('NN', ['fox'])])])])])])]
  260. >>> sum([list(dep_graphs) for dep_graphs in parser.parse_sents((
  261. ... "I 'm a dog".split(),
  262. ... "This is my friends ' cat ( the tabby )".split(),
  263. ... ))], []) # doctest: +NORMALIZE_WHITESPACE
  264. [Tree('ROOT', [Tree('S', [Tree('NP', [Tree('PRP', ['I'])]), Tree('VP', [Tree('VBP', ["'m"]),
  265. Tree('NP', [Tree('DT', ['a']), Tree('NN', ['dog'])])])])]), Tree('ROOT', [Tree('S', [Tree('NP',
  266. [Tree('DT', ['This'])]), Tree('VP', [Tree('VBZ', ['is']), Tree('NP', [Tree('NP', [Tree('NP', [Tree('PRP$', ['my']),
  267. Tree('NNS', ['friends']), Tree('POS', ["'"])]), Tree('NN', ['cat'])]), Tree('PRN', [Tree('-LRB-', [Tree('', []),
  268. Tree('NP', [Tree('DT', ['the']), Tree('NN', ['tabby'])]), Tree('-RRB-', [])])])])])])])]
  269. >>> sum([list(dep_graphs) for dep_graphs in parser.tagged_parse_sents((
  270. ... (
  271. ... ("The", "DT"),
  272. ... ("quick", "JJ"),
  273. ... ("brown", "JJ"),
  274. ... ("fox", "NN"),
  275. ... ("jumped", "VBD"),
  276. ... ("over", "IN"),
  277. ... ("the", "DT"),
  278. ... ("lazy", "JJ"),
  279. ... ("dog", "NN"),
  280. ... (".", "."),
  281. ... ),
  282. ... ))],[]) # doctest: +NORMALIZE_WHITESPACE
  283. [Tree('ROOT', [Tree('S', [Tree('NP', [Tree('DT', ['The']), Tree('JJ', ['quick']), Tree('JJ', ['brown']),
  284. Tree('NN', ['fox'])]), Tree('VP', [Tree('VBD', ['jumped']), Tree('PP', [Tree('IN', ['over']), Tree('NP',
  285. [Tree('DT', ['the']), Tree('JJ', ['lazy']), Tree('NN', ['dog'])])])]), Tree('.', ['.'])])])]
  286. """
  287. _OUTPUT_FORMAT = "penn"
  288. def __init__(self, *args, **kwargs):
  289. warnings.warn(
  290. "The StanfordParser will be deprecated\n"
  291. "Please use \033[91mnltk.parse.corenlp.CoreNLPParser\033[0m instead.",
  292. DeprecationWarning,
  293. stacklevel=2,
  294. )
  295. super(StanfordParser, self).__init__(*args, **kwargs)
  296. def _make_tree(self, result):
  297. return Tree.fromstring(result)
  298. class StanfordDependencyParser(GenericStanfordParser):
  299. """
  300. >>> dep_parser=StanfordDependencyParser(
  301. ... model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz"
  302. ... )
  303. >>> [parse.tree() for parse in dep_parser.raw_parse("The quick brown fox jumps over the lazy dog.")] # doctest: +NORMALIZE_WHITESPACE
  304. [Tree('jumps', [Tree('fox', ['The', 'quick', 'brown']), Tree('dog', ['over', 'the', 'lazy'])])]
  305. >>> [list(parse.triples()) for parse in dep_parser.raw_parse("The quick brown fox jumps over the lazy dog.")] # doctest: +NORMALIZE_WHITESPACE
  306. [[((u'jumps', u'VBZ'), u'nsubj', (u'fox', u'NN')), ((u'fox', u'NN'), u'det', (u'The', u'DT')),
  307. ((u'fox', u'NN'), u'amod', (u'quick', u'JJ')), ((u'fox', u'NN'), u'amod', (u'brown', u'JJ')),
  308. ((u'jumps', u'VBZ'), u'nmod', (u'dog', u'NN')), ((u'dog', u'NN'), u'case', (u'over', u'IN')),
  309. ((u'dog', u'NN'), u'det', (u'the', u'DT')), ((u'dog', u'NN'), u'amod', (u'lazy', u'JJ'))]]
  310. >>> sum([[parse.tree() for parse in dep_graphs] for dep_graphs in dep_parser.raw_parse_sents((
  311. ... "The quick brown fox jumps over the lazy dog.",
  312. ... "The quick grey wolf jumps over the lazy fox."
  313. ... ))], []) # doctest: +NORMALIZE_WHITESPACE
  314. [Tree('jumps', [Tree('fox', ['The', 'quick', 'brown']), Tree('dog', ['over', 'the', 'lazy'])]),
  315. Tree('jumps', [Tree('wolf', ['The', 'quick', 'grey']), Tree('fox', ['over', 'the', 'lazy'])])]
  316. >>> sum([[parse.tree() for parse in dep_graphs] for dep_graphs in dep_parser.parse_sents((
  317. ... "I 'm a dog".split(),
  318. ... "This is my friends ' cat ( the tabby )".split(),
  319. ... ))], []) # doctest: +NORMALIZE_WHITESPACE
  320. [Tree('dog', ['I', "'m", 'a']), Tree('cat', ['This', 'is', Tree('friends', ['my', "'"]), Tree('tabby', ['the'])])]
  321. >>> sum([[list(parse.triples()) for parse in dep_graphs] for dep_graphs in dep_parser.tagged_parse_sents((
  322. ... (
  323. ... ("The", "DT"),
  324. ... ("quick", "JJ"),
  325. ... ("brown", "JJ"),
  326. ... ("fox", "NN"),
  327. ... ("jumped", "VBD"),
  328. ... ("over", "IN"),
  329. ... ("the", "DT"),
  330. ... ("lazy", "JJ"),
  331. ... ("dog", "NN"),
  332. ... (".", "."),
  333. ... ),
  334. ... ))],[]) # doctest: +NORMALIZE_WHITESPACE
  335. [[((u'jumped', u'VBD'), u'nsubj', (u'fox', u'NN')), ((u'fox', u'NN'), u'det', (u'The', u'DT')),
  336. ((u'fox', u'NN'), u'amod', (u'quick', u'JJ')), ((u'fox', u'NN'), u'amod', (u'brown', u'JJ')),
  337. ((u'jumped', u'VBD'), u'nmod', (u'dog', u'NN')), ((u'dog', u'NN'), u'case', (u'over', u'IN')),
  338. ((u'dog', u'NN'), u'det', (u'the', u'DT')), ((u'dog', u'NN'), u'amod', (u'lazy', u'JJ'))]]
  339. """
  340. _OUTPUT_FORMAT = "conll2007"
  341. def __init__(self, *args, **kwargs):
  342. warnings.warn(
  343. "The StanfordDependencyParser will be deprecated\n"
  344. "Please use \033[91mnltk.parse.corenlp.CoreNLPDependencyParser\033[0m instead.",
  345. DeprecationWarning,
  346. stacklevel=2,
  347. )
  348. super(StanfordDependencyParser, self).__init__(*args, **kwargs)
  349. def _make_tree(self, result):
  350. return DependencyGraph(result, top_relation_label="root")
  351. class StanfordNeuralDependencyParser(GenericStanfordParser):
  352. """
  353. >>> from nltk.parse.stanford import StanfordNeuralDependencyParser
  354. >>> dep_parser=StanfordNeuralDependencyParser(java_options='-mx4g')
  355. >>> [parse.tree() for parse in dep_parser.raw_parse("The quick brown fox jumps over the lazy dog.")] # doctest: +NORMALIZE_WHITESPACE
  356. [Tree('jumps', [Tree('fox', ['The', 'quick', 'brown']), Tree('dog', ['over', 'the', 'lazy']), '.'])]
  357. >>> [list(parse.triples()) for parse in dep_parser.raw_parse("The quick brown fox jumps over the lazy dog.")] # doctest: +NORMALIZE_WHITESPACE
  358. [[((u'jumps', u'VBZ'), u'nsubj', (u'fox', u'NN')), ((u'fox', u'NN'), u'det',
  359. (u'The', u'DT')), ((u'fox', u'NN'), u'amod', (u'quick', u'JJ')), ((u'fox', u'NN'),
  360. u'amod', (u'brown', u'JJ')), ((u'jumps', u'VBZ'), u'nmod', (u'dog', u'NN')),
  361. ((u'dog', u'NN'), u'case', (u'over', u'IN')), ((u'dog', u'NN'), u'det',
  362. (u'the', u'DT')), ((u'dog', u'NN'), u'amod', (u'lazy', u'JJ')), ((u'jumps', u'VBZ'),
  363. u'punct', (u'.', u'.'))]]
  364. >>> sum([[parse.tree() for parse in dep_graphs] for dep_graphs in dep_parser.raw_parse_sents((
  365. ... "The quick brown fox jumps over the lazy dog.",
  366. ... "The quick grey wolf jumps over the lazy fox."
  367. ... ))], []) # doctest: +NORMALIZE_WHITESPACE
  368. [Tree('jumps', [Tree('fox', ['The', 'quick', 'brown']), Tree('dog', ['over',
  369. 'the', 'lazy']), '.']), Tree('jumps', [Tree('wolf', ['The', 'quick', 'grey']),
  370. Tree('fox', ['over', 'the', 'lazy']), '.'])]
  371. >>> sum([[parse.tree() for parse in dep_graphs] for dep_graphs in dep_parser.parse_sents((
  372. ... "I 'm a dog".split(),
  373. ... "This is my friends ' cat ( the tabby )".split(),
  374. ... ))], []) # doctest: +NORMALIZE_WHITESPACE
  375. [Tree('dog', ['I', "'m", 'a']), Tree('cat', ['This', 'is', Tree('friends',
  376. ['my', "'"]), Tree('tabby', ['-LRB-', 'the', '-RRB-'])])]
  377. """
  378. _OUTPUT_FORMAT = "conll"
  379. _MAIN_CLASS = "edu.stanford.nlp.pipeline.StanfordCoreNLP"
  380. _JAR = r"stanford-corenlp-(\d+)(\.(\d+))+\.jar"
  381. _MODEL_JAR_PATTERN = r"stanford-corenlp-(\d+)(\.(\d+))+-models\.jar"
  382. _USE_STDIN = True
  383. _DOUBLE_SPACED_OUTPUT = True
  384. def __init__(self, *args, **kwargs):
  385. warnings.warn(
  386. "The StanfordNeuralDependencyParser will be deprecated\n"
  387. "Please use \033[91mnltk.parse.corenlp.CoreNLPDependencyParser\033[0m instead.",
  388. DeprecationWarning,
  389. stacklevel=2,
  390. )
  391. super(StanfordNeuralDependencyParser, self).__init__(*args, **kwargs)
  392. self.corenlp_options += "-annotators tokenize,ssplit,pos,depparse"
  393. def tagged_parse_sents(self, sentences, verbose=False):
  394. """
  395. Currently unimplemented because the neural dependency parser (and
  396. the StanfordCoreNLP pipeline class) doesn't support passing in pre-
  397. tagged tokens.
  398. """
  399. raise NotImplementedError(
  400. "tagged_parse[_sents] is not supported by "
  401. "StanfordNeuralDependencyParser; use "
  402. "parse[_sents] or raw_parse[_sents] instead."
  403. )
  404. def _make_tree(self, result):
  405. return DependencyGraph(result, top_relation_label="ROOT")
  406. @skip("doctests from nltk.parse.stanford are skipped because it's deprecated")
  407. def setup_module(module):
  408. from nose import SkipTest
  409. try:
  410. StanfordParser(
  411. model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz"
  412. )
  413. StanfordNeuralDependencyParser()
  414. except LookupError:
  415. raise SkipTest(
  416. "doctests from nltk.parse.stanford are skipped because one of the stanford parser or CoreNLP jars doesn't exist"
  417. )