# -*- coding: utf-8 -*-
# Natural Language Toolkit: Interface to the CoreNLP REST API.
#
# Copyright (C) 2001-2020 NLTK Project
# Author: Dmitrijs Milajevs <dimazest@gmail.com>
#
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT

import re
import json
import time
import socket

from nltk.internals import find_jar_iter, config_java, java, _java_options
from nltk.tag.api import TaggerI
from nltk.parse.api import ParserI
from nltk.tokenize.api import TokenizerI
from nltk.parse.dependencygraph import DependencyGraph
from nltk.tree import Tree

from unittest import skip

_stanford_url = "http://stanfordnlp.github.io/CoreNLP/"


class CoreNLPServerError(EnvironmentError):
    """Exceptions associated with the Core NLP server."""


def try_port(port=0):
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    sock.bind(("", port))

    p = sock.getsockname()[1]
    sock.close()

    return p


class CoreNLPServer(object):

    _MODEL_JAR_PATTERN = r"stanford-corenlp-(\d+)\.(\d+)\.(\d+)-models\.jar"
    _JAR = r"stanford-corenlp-(\d+)\.(\d+)\.(\d+)\.jar"

    def __init__(
        self,
        path_to_jar=None,
        path_to_models_jar=None,
        verbose=False,
        java_options=None,
        corenlp_options=None,
        port=None,
    ):

        if corenlp_options is None:
            corenlp_options = ["-preload", "tokenize,ssplit,pos,lemma,parse,depparse"]

        jars = list(
            find_jar_iter(
                self._JAR,
                path_to_jar,
                env_vars=("CORENLP",),
                searchpath=(),
                url=_stanford_url,
                verbose=verbose,
                is_regex=True,
            )
        )

        # find the most recent code and model jar
        stanford_jar = max(jars, key=lambda model_name: re.match(self._JAR, model_name))

        if port is None:
            try:
                port = try_port(9000)
            except socket.error:
                port = try_port()
                corenlp_options.append(str(port))
        else:
            try_port(port)

        self.url = "http://localhost:{}".format(port)

        model_jar = max(
            find_jar_iter(
                self._MODEL_JAR_PATTERN,
                path_to_models_jar,
                env_vars=("CORENLP_MODELS",),
                searchpath=(),
                url=_stanford_url,
                verbose=verbose,
                is_regex=True,
            ),
            key=lambda model_name: re.match(self._MODEL_JAR_PATTERN, model_name),
        )

        self.verbose = verbose
        self._classpath = stanford_jar, model_jar

        self.corenlp_options = corenlp_options
        self.java_options = java_options or ["-mx2g"]

    def start(self, stdout="devnull", stderr="devnull"):
        """Starts the CoreNLP server.

        :param stdout, stderr: Specifies where CoreNLP output is redirected. Valid values are 'devnull', 'stdout', 'pipe'
        """
        import requests

        cmd = ["edu.stanford.nlp.pipeline.StanfordCoreNLPServer"]

        if self.corenlp_options:
            cmd.extend(self.corenlp_options)

        # Configure java.
        default_options = " ".join(_java_options)
        config_java(options=self.java_options, verbose=self.verbose)

        try:
            self.popen = java(
                cmd,
                classpath=self._classpath,
                blocking=False,
                stdout=stdout,
                stderr=stderr,
            )
        finally:
            # Return java configurations to their default values.
            config_java(options=default_options, verbose=self.verbose)

        # Check that the server is still running.
        returncode = self.popen.poll()
        if returncode is not None:
            _, stderrdata = self.popen.communicate()

            raise CoreNLPServerError(
                returncode,
                "Could not start the server. "
                "The error was: {}".format(stderrdata.decode("ascii")),
            )

        for i in range(30):
            try:
                response = requests.get(requests.compat.urljoin(self.url, "live"))
            except requests.exceptions.ConnectionError:
                time.sleep(1)
            else:
                if response.ok:
                    break
        else:
            raise CoreNLPServerError("Could not connect to the server.")

        for i in range(60):
            try:
                response = requests.get(requests.compat.urljoin(self.url, "ready"))
            except requests.exceptions.ConnectionError:
                time.sleep(1)
            else:
                if response.ok:
                    break
        else:
            raise CoreNLPServerError("The server is not ready.")

    def stop(self):
        self.popen.terminate()
        self.popen.wait()

    def __enter__(self):
        self.start()

        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.stop()

        return False
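
# A minimal usage sketch (assumptions: the CoreNLP jar and models jar are
# discoverable via the CORENLP / CORENLP_MODELS environment variables, or are
# passed in explicitly via ``path_to_jar`` / ``path_to_models_jar``):
#
#     with CoreNLPServer(port=9000) as server:
#         parser = CoreNLPParser(url=server.url)
#         next(parser.raw_parse('The quick brown fox jumps over the lazy dog.'))
#
# The context manager starts the Java server process on entry and terminates
# it on exit; ``server.url`` points at the locally spawned instance.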


class GenericCoreNLPParser(ParserI, TokenizerI, TaggerI):
    """Interface to the CoreNLP Parser."""

    def __init__(self, url="http://localhost:9000", encoding="utf8", tagtype=None):
        import requests

        self.url = url
        self.encoding = encoding

        if tagtype not in ["pos", "ner", None]:
            raise ValueError("tagtype must be either 'pos', 'ner' or None")

        self.tagtype = tagtype

        self.session = requests.Session()

    def parse_sents(self, sentences, *args, **kwargs):
        """Parse multiple sentences.

        Takes multiple sentences as a list where each sentence is a list of
        words. Each sentence will be automatically tagged with this
        CoreNLPParser instance's tagger.

        If whitespace exists inside a token, then the token will be treated as
        several tokens.

        :param sentences: Input sentences to parse
        :type sentences: list(list(str))
        :rtype: iter(iter(Tree))
        """
        # Converting list(list(str)) -> list(str)
        sentences = (" ".join(words) for words in sentences)
        return self.raw_parse_sents(sentences, *args, **kwargs)

    def raw_parse(self, sentence, properties=None, *args, **kwargs):
        """Parse a sentence.

        Takes a sentence as a string; before parsing, it will be automatically
        tokenized and tagged by the CoreNLP Parser.

        :param sentence: Input sentence to parse
        :type sentence: str
        :rtype: iter(Tree)
        """
        default_properties = {"tokenize.whitespace": "false"}
        default_properties.update(properties or {})

        return next(
            self.raw_parse_sents(
                [sentence], properties=default_properties, *args, **kwargs
            )
        )

    def api_call(self, data, properties=None, timeout=60):
        default_properties = {
            "outputFormat": "json",
            "annotators": "tokenize,pos,lemma,ssplit,{parser_annotator}".format(
                parser_annotator=self.parser_annotator
            ),
        }

        default_properties.update(properties or {})

        response = self.session.post(
            self.url,
            params={"properties": json.dumps(default_properties)},
            data=data.encode(self.encoding),
            timeout=timeout,
        )

        response.raise_for_status()

        return response.json()
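
    # For reference, the POST above corresponds (roughly) to querying a running
    # CoreNLP server directly: the annotators and output format travel in the
    # ``properties`` URL parameter, while the raw text goes in the request body.
    # An illustrative curl equivalent (sketch only):
    #
    #     curl --data 'The quick brown fox jumps over the lazy dog.' \
    #          'http://localhost:9000/?properties={"annotators":"tokenize,ssplit,pos","outputFormat":"json"}'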

    def raw_parse_sents(
        self, sentences, verbose=False, properties=None, *args, **kwargs
    ):
        """Parse multiple sentences.

        Takes multiple sentences as a list of strings. Each sentence will be
        automatically tokenized and tagged.

        :param sentences: Input sentences to parse.
        :type sentences: list(str)
        :rtype: iter(iter(Tree))
        """
        default_properties = {
            # Only splits on '\n', never inside the sentence.
            "ssplit.eolonly": "true"
        }

        default_properties.update(properties or {})

        """
        for sentence in sentences:
            parsed_data = self.api_call(sentence, properties=default_properties)

            assert len(parsed_data['sentences']) == 1

            for parse in parsed_data['sentences']:
                tree = self.make_tree(parse)
                yield iter([tree])
        """
        parsed_data = self.api_call(
            "\n".join(sentences), properties=default_properties
        )
        for parsed_sent in parsed_data["sentences"]:
            tree = self.make_tree(parsed_sent)
            yield iter([tree])

    def parse_text(self, text, *args, **kwargs):
        """Parse a piece of text.

        The text might contain several sentences which will be split by CoreNLP.

        :param str text: text to be split.
        :returns: an iterable of syntactic structures.  # TODO: should it be an iterable of iterables?
        """
        parsed_data = self.api_call(text, *args, **kwargs)

        for parse in parsed_data["sentences"]:
            yield self.make_tree(parse)

    def tokenize(self, text, properties=None):
        """Tokenize a string of text.

        >>> parser = CoreNLPParser(url='http://localhost:9000')

        >>> text = 'Good muffins cost $3.88\\nin New York. Please buy me\\ntwo of them.\\nThanks.'
        >>> list(parser.tokenize(text))
        ['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York', '.', 'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']

        >>> s = "The colour of the wall is blue."
        >>> list(
        ...     parser.tokenize(
        ...         'The colour of the wall is blue.',
        ...         properties={'tokenize.options': 'americanize=true'},
        ...     )
        ... )
        ['The', 'color', 'of', 'the', 'wall', 'is', 'blue', '.']
        """
        default_properties = {"annotators": "tokenize,ssplit"}

        default_properties.update(properties or {})

        result = self.api_call(text, properties=default_properties)

        for sentence in result["sentences"]:
            for token in sentence["tokens"]:
                yield token["originalText"] or token["word"]

    def tag_sents(self, sentences):
        """
        Tag multiple sentences.

        Takes multiple sentences as a list where each sentence is a list of
        tokens.

        :param sentences: Input sentences to tag
        :type sentences: list(list(str))
        :rtype: list(list(tuple(str, str)))
        """
        # Converting list(list(str)) -> list(str)
        sentences = (" ".join(words) for words in sentences)
        return [sentences[0] for sentences in self.raw_tag_sents(sentences)]

    def tag(self, sentence):
        """
        Tag a list of tokens.

        :rtype: list(tuple(str, str))

        >>> parser = CoreNLPParser(url='http://localhost:9000', tagtype='ner')
        >>> tokens = 'Rami Eid is studying at Stony Brook University in NY'.split()
        >>> parser.tag(tokens)
        [('Rami', 'PERSON'), ('Eid', 'PERSON'), ('is', 'O'), ('studying', 'O'), ('at', 'O'), ('Stony', 'ORGANIZATION'),
        ('Brook', 'ORGANIZATION'), ('University', 'ORGANIZATION'), ('in', 'O'), ('NY', 'O')]

        >>> parser = CoreNLPParser(url='http://localhost:9000', tagtype='pos')
        >>> tokens = "What is the airspeed of an unladen swallow ?".split()
        >>> parser.tag(tokens)
        [('What', 'WP'), ('is', 'VBZ'), ('the', 'DT'),
        ('airspeed', 'NN'), ('of', 'IN'), ('an', 'DT'),
        ('unladen', 'JJ'), ('swallow', 'VB'), ('?', '.')]
        """
        return self.tag_sents([sentence])[0]

    def raw_tag_sents(self, sentences):
        """
        Tag multiple sentences.

        Takes multiple sentences as a list where each sentence is a string.

        :param sentences: Input sentences to tag
        :type sentences: list(str)
        :rtype: list(list(list(tuple(str, str))))
        """
        default_properties = {
            "ssplit.isOneSentence": "true",
            "annotators": "tokenize,ssplit,",
        }

        # Supports only 'pos' or 'ner' tags.
        assert self.tagtype in ["pos", "ner"]
        default_properties["annotators"] += self.tagtype

        for sentence in sentences:
            tagged_data = self.api_call(sentence, properties=default_properties)
            yield [
                [
                    (token["word"], token[self.tagtype])
                    for token in tagged_sentence["tokens"]
                ]
                for tagged_sentence in tagged_data["sentences"]
            ]


class CoreNLPParser(GenericCoreNLPParser):
    """
    >>> parser = CoreNLPParser(url='http://localhost:9000')

    >>> next(
    ...     parser.raw_parse('The quick brown fox jumps over the lazy dog.')
    ... ).pretty_print()  # doctest: +NORMALIZE_WHITESPACE
                         ROOT
                          |
                          S
           _______________|__________________________
          |                          VP               |
          |                 _________|___             |
          |                |             PP           |
          |                |     ________|___         |
          NP               |    |            NP       |
      ____|__________      |    |     _______|____    |
     DT   JJ    JJ    NN  VBZ   IN   DT      JJ   NN  .
     |    |     |     |    |    |    |       |    |   |
    The quick brown  fox jumps over the     lazy dog  .

    >>> (parse_fox, ), (parse_wolf, ) = parser.raw_parse_sents(
    ...     [
    ...         'The quick brown fox jumps over the lazy dog.',
    ...         'The quick grey wolf jumps over the lazy fox.',
    ...     ]
    ... )

    >>> parse_fox.pretty_print()  # doctest: +NORMALIZE_WHITESPACE
                         ROOT
                          |
                          S
           _______________|__________________________
          |                          VP               |
          |                 _________|___             |
          |                |             PP           |
          |                |     ________|___         |
          NP               |    |            NP       |
      ____|__________      |    |     _______|____    |
     DT   JJ    JJ    NN  VBZ   IN   DT      JJ   NN  .
     |    |     |     |    |    |    |       |    |   |
    The quick brown  fox jumps over the     lazy dog  .

    >>> parse_wolf.pretty_print()  # doctest: +NORMALIZE_WHITESPACE
                         ROOT
                          |
                          S
           _______________|__________________________
          |                          VP               |
          |                 _________|___             |
          |                |             PP           |
          |                |     ________|___         |
          NP               |    |            NP       |
      ____|_________       |    |     _______|____    |
     DT   JJ   JJ   NN    VBZ   IN   DT      JJ   NN  .
     |    |    |    |      |    |    |       |    |   |
    The quick grey wolf  jumps over the     lazy fox  .

    >>> (parse_dog, ), (parse_friends, ) = parser.parse_sents(
    ...     [
    ...         "I 'm a dog".split(),
    ...         "This is my friends ' cat ( the tabby )".split(),
    ...     ]
    ... )

    >>> parse_dog.pretty_print()  # doctest: +NORMALIZE_WHITESPACE
            ROOT
             |
             S
      _______|____
     |            VP
     |    ________|___
     NP  |            NP
     |   |         ___|___
    PRP VBP       DT      NN
     |   |        |       |
     I   'm       a      dog

    >>> parse_friends.pretty_print()  # doctest: +NORMALIZE_WHITESPACE
         ROOT
          |
          S
      ____|___________
     |                VP
     |     ___________|_____________
     |    |                         NP
     |    |                  _______|_________
     |    |                 NP               PRN
     |    |            _____|_______     ____|______________
     NP   |           NP            |   |    NP             |
     |    |      ______|_________   |   |  ___|____         |
     DT  VBZ   PRP$   NNS      POS  NN -LRB- DT   NN      -RRB-
     |    |     |      |        |   |   |    |    |         |
    This  is    my  friends     '  cat -LRB- the tabby    -RRB-

    >>> parse_john, parse_mary, = parser.parse_text(
    ...     'John loves Mary. Mary walks.'
    ... )

    >>> parse_john.pretty_print()  # doctest: +NORMALIZE_WHITESPACE
          ROOT
           |
           S
      _____|_____________
     |          VP       |
     |      ____|___     |
     NP    |        NP   |
     |     |        |    |
     NNP  VBZ      NNP   .
     |     |        |    |
    John loves     Mary  .

    >>> parse_mary.pretty_print()  # doctest: +NORMALIZE_WHITESPACE
          ROOT
           |
           S
       _____|____
      NP    VP   |
      |     |    |
     NNP   VBZ   .
      |     |    |
    Mary  walks  .

    Special cases
    -------------

    >>> next(
    ...     parser.raw_parse(
    ...         'NASIRIYA, Iraq—Iraqi doctors who treated former prisoner of war '
    ...         'Jessica Lynch have angrily dismissed claims made in her biography '
    ...         'that she was raped by her Iraqi captors.'
    ...     )
    ... ).height()
    20

    >>> next(
    ...     parser.raw_parse(
    ...         "The broader Standard & Poor's 500 Index <.SPX> was 0.46 points lower, or "
    ...         '0.05 percent, at 997.02.'
    ...     )
    ... ).height()
    9
    """

    _OUTPUT_FORMAT = "penn"
    parser_annotator = "parse"

    def make_tree(self, result):

        return Tree.fromstring(result["parse"])


class CoreNLPDependencyParser(GenericCoreNLPParser):
    """Dependency parser.

    >>> dep_parser = CoreNLPDependencyParser(url='http://localhost:9000')

    >>> parse, = dep_parser.raw_parse(
    ...     'The quick brown fox jumps over the lazy dog.'
    ... )
    >>> print(parse.to_conll(4))  # doctest: +NORMALIZE_WHITESPACE
    The     DT      4       det
    quick   JJ      4       amod
    brown   JJ      4       amod
    fox     NN      5       nsubj
    jumps   VBZ     0       ROOT
    over    IN      9       case
    the     DT      9       det
    lazy    JJ      9       amod
    dog     NN      5       nmod
    .       .       5       punct

    >>> print(parse.tree())  # doctest: +NORMALIZE_WHITESPACE
    (jumps (fox The quick brown) (dog over the lazy) .)

    >>> for governor, dep, dependent in parse.triples():
    ...     print(governor, dep, dependent)  # doctest: +NORMALIZE_WHITESPACE
    ('jumps', 'VBZ') nsubj ('fox', 'NN')
    ('fox', 'NN') det ('The', 'DT')
    ('fox', 'NN') amod ('quick', 'JJ')
    ('fox', 'NN') amod ('brown', 'JJ')
    ('jumps', 'VBZ') nmod ('dog', 'NN')
    ('dog', 'NN') case ('over', 'IN')
    ('dog', 'NN') det ('the', 'DT')
    ('dog', 'NN') amod ('lazy', 'JJ')
    ('jumps', 'VBZ') punct ('.', '.')

    >>> (parse_fox, ), (parse_dog, ) = dep_parser.raw_parse_sents(
    ...     [
    ...         'The quick brown fox jumps over the lazy dog.',
    ...         'The quick grey wolf jumps over the lazy fox.',
    ...     ]
    ... )
    >>> print(parse_fox.to_conll(4))  # doctest: +NORMALIZE_WHITESPACE
    The     DT      4       det
    quick   JJ      4       amod
    brown   JJ      4       amod
    fox     NN      5       nsubj
    jumps   VBZ     0       ROOT
    over    IN      9       case
    the     DT      9       det
    lazy    JJ      9       amod
    dog     NN      5       nmod
    .       .       5       punct

    >>> print(parse_dog.to_conll(4))  # doctest: +NORMALIZE_WHITESPACE
    The     DT      4       det
    quick   JJ      4       amod
    grey    JJ      4       amod
    wolf    NN      5       nsubj
    jumps   VBZ     0       ROOT
    over    IN      9       case
    the     DT      9       det
    lazy    JJ      9       amod
    fox     NN      5       nmod
    .       .       5       punct

    >>> (parse_dog, ), (parse_friends, ) = dep_parser.parse_sents(
    ...     [
    ...         "I 'm a dog".split(),
    ...         "This is my friends ' cat ( the tabby )".split(),
    ...     ]
    ... )
    >>> print(parse_dog.to_conll(4))  # doctest: +NORMALIZE_WHITESPACE
    I       PRP     4       nsubj
    'm      VBP     4       cop
    a       DT      4       det
    dog     NN      0       ROOT

    >>> print(parse_friends.to_conll(4))  # doctest: +NORMALIZE_WHITESPACE
    This    DT      6       nsubj
    is      VBZ     6       cop
    my      PRP$    4       nmod:poss
    friends NNS     6       nmod:poss
    '       POS     4       case
    cat     NN      0       ROOT
    -LRB-   -LRB-   9       punct
    the     DT      9       det
    tabby   NN      6       appos
    -RRB-   -RRB-   9       punct

    >>> parse_john, parse_mary, = dep_parser.parse_text(
    ...     'John loves Mary. Mary walks.'
    ... )

    >>> print(parse_john.to_conll(4))  # doctest: +NORMALIZE_WHITESPACE
    John    NNP     2       nsubj
    loves   VBZ     0       ROOT
    Mary    NNP     2       dobj
    .       .       2       punct

    >>> print(parse_mary.to_conll(4))  # doctest: +NORMALIZE_WHITESPACE
    Mary    NNP     2       nsubj
    walks   VBZ     0       ROOT
    .       .       2       punct

    Special cases
    -------------

    Non-breaking space inside of a token.

    >>> len(
    ...     next(
    ...         dep_parser.raw_parse(
    ...             'Anhalt said children typically treat a 20-ounce soda bottle as one '
    ...             'serving, while it actually contains 2 1/2 servings.'
    ...         )
    ...     ).nodes
    ... )
    21

    Phone numbers.

    >>> len(
    ...     next(
    ...         dep_parser.raw_parse('This is not going to crash: 01 111 555.')
    ...     ).nodes
    ... )
    10

    >>> print(
    ...     next(
    ...         dep_parser.raw_parse('The underscore _ should not simply disappear.')
    ...     ).to_conll(4)
    ... )  # doctest: +NORMALIZE_WHITESPACE
    The         DT      3       det
    underscore  VBP     3       amod
    _           NN      7       nsubj
    should      MD      7       aux
    not         RB      7       neg
    simply      RB      7       advmod
    disappear   VB      0       ROOT
    .           .       7       punct

    >>> print(
    ...     '\\n'.join(
    ...         next(
    ...             dep_parser.raw_parse(
    ...                 'for all of its insights into the dream world of teen life , and its electronic expression through '
    ...                 'cyber culture , the film gives no quarter to anyone seeking to pull a cohesive story out of its 2 '
    ...                 '1/2-hour running time .'
    ...             )
    ...         ).to_conll(4).split('\\n')[-8:]
    ...     )
    ... )
    its PRP$ 40 nmod:poss
    2 1/2 CD 40 nummod
    - : 40 punct
    hour NN 31 nmod
    running VBG 42 amod
    time NN 40 dep
    . . 24 punct
    <BLANKLINE>
    """

    _OUTPUT_FORMAT = "conll2007"
    parser_annotator = "depparse"

    def make_tree(self, result):

        return DependencyGraph(
            (
                " ".join(n_items[1:])  # NLTK expects an iterable of strings...
                for n_items in sorted(transform(result))
            ),
            cell_separator=" ",  # To make sure that a non-breaking space is kept inside of a token.
        )


def transform(sentence):
    for dependency in sentence["basicDependencies"]:

        dependent_index = dependency["dependent"]
        token = sentence["tokens"][dependent_index - 1]

        # Return values that we don't know as '_'. Also, consider tag and ctag
        # to be equal.
        yield (
            dependent_index,
            "_",
            token["word"],
            token["lemma"],
            token["pos"],
            token["pos"],
            "_",
            str(dependency["governor"]),
            dependency["dep"],
            "_",
            "_",
        )
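
# For illustration: a dependency such as det(fox-4, The-1) in the CoreNLP JSON
# response would be emitted by transform() as (roughly)
#
#     (1, '_', 'The', 'the', 'DT', 'DT', '_', '4', 'det', '_', '_')
#
# and CoreNLPDependencyParser.make_tree() joins everything after the leading
# index into a single space-separated CoNLL-style row for DependencyGraph.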


@skip("Skipping all CoreNLP tests.")
def setup_module(module):
    from nose import SkipTest

    global server

    try:
        server = CoreNLPServer(port=9000)
    except LookupError as e:
        raise SkipTest("Could not instantiate CoreNLPServer.")

    try:
        server.start()
    except CoreNLPServerError as e:
        raise SkipTest(
            "Skipping CoreNLP tests because the server could not be started. "
            "Make sure that port 9000 is free. "
            "{}".format(e.strerror)
        )


@skip("Skipping all CoreNLP tests.")
def teardown_module(module):
    server.stop()