malt.py 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395
  1. # -*- coding: utf-8 -*-
  2. # Natural Language Toolkit: Interface to MaltParser
  3. #
  4. # Author: Dan Garrette <dhgarrette@gmail.com>
  5. # Contributor: Liling Tan, Mustufain, osamamukhtar11
  6. #
  7. # Copyright (C) 2001-2020 NLTK Project
  8. # URL: <http://nltk.org/>
  9. # For license information, see LICENSE.TXT
  10. import os
  11. import sys
  12. import tempfile
  13. import subprocess
  14. import inspect
  15. from nltk.data import ZipFilePathPointer
  16. from nltk.internals import find_dir, find_file, find_jars_within_path
  17. from nltk.parse.api import ParserI
  18. from nltk.parse.dependencygraph import DependencyGraph
  19. from nltk.parse.util import taggedsents_to_conll
  20. def malt_regex_tagger():
  21. from nltk.tag import RegexpTagger
  22. _tagger = RegexpTagger(
  23. [
  24. (r"\.$", "."),
  25. (r"\,$", ","),
  26. (r"\?$", "?"), # fullstop, comma, Qmark
  27. (r"\($", "("),
  28. (r"\)$", ")"), # round brackets
  29. (r"\[$", "["),
  30. (r"\]$", "]"), # square brackets
  31. (r"^-?[0-9]+(.[0-9]+)?$", "CD"), # cardinal numbers
  32. (r"(The|the|A|a|An|an)$", "DT"), # articles
  33. (r"(He|he|She|she|It|it|I|me|Me|You|you)$", "PRP"), # pronouns
  34. (r"(His|his|Her|her|Its|its)$", "PRP$"), # possesive
  35. (r"(my|Your|your|Yours|yours)$", "PRP$"), # possesive
  36. (r"(on|On|in|In|at|At|since|Since)$", "IN"), # time prepopsitions
  37. (r"(for|For|ago|Ago|before|Before)$", "IN"), # time prepopsitions
  38. (r"(till|Till|until|Until)$", "IN"), # time prepopsitions
  39. (r"(by|By|beside|Beside)$", "IN"), # space prepopsitions
  40. (r"(under|Under|below|Below)$", "IN"), # space prepopsitions
  41. (r"(over|Over|above|Above)$", "IN"), # space prepopsitions
  42. (r"(across|Across|through|Through)$", "IN"), # space prepopsitions
  43. (r"(into|Into|towards|Towards)$", "IN"), # space prepopsitions
  44. (r"(onto|Onto|from|From)$", "IN"), # space prepopsitions
  45. (r".*able$", "JJ"), # adjectives
  46. (r".*ness$", "NN"), # nouns formed from adjectives
  47. (r".*ly$", "RB"), # adverbs
  48. (r".*s$", "NNS"), # plural nouns
  49. (r".*ing$", "VBG"), # gerunds
  50. (r".*ed$", "VBD"), # past tense verbs
  51. (r".*", "NN"), # nouns (default)
  52. ]
  53. )
  54. return _tagger.tag
  55. def find_maltparser(parser_dirname):
  56. """
  57. A module to find MaltParser .jar file and its dependencies.
  58. """
  59. if os.path.exists(parser_dirname): # If a full path is given.
  60. _malt_dir = parser_dirname
  61. else: # Try to find path to maltparser directory in environment variables.
  62. _malt_dir = find_dir(parser_dirname, env_vars=("MALT_PARSER",))
  63. # Checks that that the found directory contains all the necessary .jar
  64. malt_dependencies = ["", "", ""]
  65. _malt_jars = set(find_jars_within_path(_malt_dir))
  66. _jars = set(os.path.split(jar)[1] for jar in _malt_jars)
  67. malt_dependencies = set(["log4j.jar", "libsvm.jar", "liblinear-1.8.jar"])
  68. assert malt_dependencies.issubset(_jars)
  69. assert any(
  70. filter(lambda i: i.startswith("maltparser-") and i.endswith(".jar"), _jars)
  71. )
  72. return list(_malt_jars)
  73. def find_malt_model(model_filename):
  74. """
  75. A module to find pre-trained MaltParser model.
  76. """
  77. if model_filename is None:
  78. return "malt_temp.mco"
  79. elif os.path.exists(model_filename): # If a full path is given.
  80. return model_filename
  81. else: # Try to find path to malt model in environment variables.
  82. return find_file(model_filename, env_vars=("MALT_MODEL",), verbose=False)
  83. class MaltParser(ParserI):
  84. """
  85. A class for dependency parsing with MaltParser. The input is the paths to:
  86. - a maltparser directory
  87. - (optionally) the path to a pre-trained MaltParser .mco model file
  88. - (optionally) the tagger to use for POS tagging before parsing
  89. - (optionally) additional Java arguments
  90. Example:
  91. >>> from nltk.parse import malt
  92. >>> # With MALT_PARSER and MALT_MODEL environment set.
  93. >>> mp = malt.MaltParser('maltparser-1.7.2', 'engmalt.linear-1.7.mco') # doctest: +SKIP
  94. >>> mp.parse_one('I shot an elephant in my pajamas .'.split()).tree() # doctest: +SKIP
  95. (shot I (elephant an) (in (pajamas my)) .)
  96. >>> # Without MALT_PARSER and MALT_MODEL environment.
  97. >>> mp = malt.MaltParser('/home/user/maltparser-1.7.2/', '/home/user/engmalt.linear-1.7.mco') # doctest: +SKIP
  98. >>> mp.parse_one('I shot an elephant in my pajamas .'.split()).tree() # doctest: +SKIP
  99. (shot I (elephant an) (in (pajamas my)) .)
  100. """
  101. def __init__(
  102. self,
  103. parser_dirname,
  104. model_filename=None,
  105. tagger=None,
  106. additional_java_args=None,
  107. ):
  108. """
  109. An interface for parsing with the Malt Parser.
  110. :param parser_dirname: The path to the maltparser directory that
  111. contains the maltparser-1.x.jar
  112. :type parser_dirname: str
  113. :param model_filename: The name of the pre-trained model with .mco file
  114. extension. If provided, training will not be required.
  115. (see http://www.maltparser.org/mco/mco.html and
  116. see http://www.patful.com/chalk/node/185)
  117. :type model_filename: str
  118. :param tagger: The tagger used to POS tag the raw string before
  119. formatting to CONLL format. It should behave like `nltk.pos_tag`
  120. :type tagger: function
  121. :param additional_java_args: This is the additional Java arguments that
  122. one can use when calling Maltparser, usually this is the heapsize
  123. limits, e.g. `additional_java_args=['-Xmx1024m']`
  124. (see http://goo.gl/mpDBvQ)
  125. :type additional_java_args: list
  126. """
  127. # Find all the necessary jar files for MaltParser.
  128. self.malt_jars = find_maltparser(parser_dirname)
  129. # Initialize additional java arguments.
  130. self.additional_java_args = (
  131. additional_java_args if additional_java_args is not None else []
  132. )
  133. # Initialize model.
  134. self.model = find_malt_model(model_filename)
  135. self._trained = self.model != "malt_temp.mco"
  136. # Set the working_dir parameters i.e. `-w` from MaltParser's option.
  137. self.working_dir = tempfile.gettempdir()
  138. # Initialize POS tagger.
  139. self.tagger = tagger if tagger is not None else malt_regex_tagger()
  140. def parse_tagged_sents(self, sentences, verbose=False, top_relation_label="null"):
  141. """
  142. Use MaltParser to parse multiple POS tagged sentences. Takes multiple
  143. sentences where each sentence is a list of (word, tag) tuples.
  144. The sentences must have already been tokenized and tagged.
  145. :param sentences: Input sentences to parse
  146. :type sentence: list(list(tuple(str, str)))
  147. :return: iter(iter(``DependencyGraph``)) the dependency graph
  148. representation of each sentence
  149. """
  150. if not self._trained:
  151. raise Exception("Parser has not been trained. Call train() first.")
  152. with tempfile.NamedTemporaryFile(
  153. prefix="malt_input.conll.", dir=self.working_dir, mode="w", delete=False
  154. ) as input_file:
  155. with tempfile.NamedTemporaryFile(
  156. prefix="malt_output.conll.",
  157. dir=self.working_dir,
  158. mode="w",
  159. delete=False,
  160. ) as output_file:
  161. # Convert list of sentences to CONLL format.
  162. for line in taggedsents_to_conll(sentences):
  163. input_file.write(str(line))
  164. input_file.close()
  165. # Generate command to run maltparser.
  166. cmd = self.generate_malt_command(
  167. input_file.name, output_file.name, mode="parse"
  168. )
  169. # This is a maltparser quirk, it needs to be run
  170. # where the model file is. otherwise it goes into an awkward
  171. # missing .jars or strange -w working_dir problem.
  172. _current_path = os.getcwd() # Remembers the current path.
  173. try: # Change to modelfile path
  174. os.chdir(os.path.split(self.model)[0])
  175. except:
  176. pass
  177. ret = self._execute(cmd, verbose) # Run command.
  178. os.chdir(_current_path) # Change back to current path.
  179. if ret != 0:
  180. raise Exception(
  181. "MaltParser parsing (%s) failed with exit "
  182. "code %d" % (" ".join(cmd), ret)
  183. )
  184. # Must return iter(iter(Tree))
  185. with open(output_file.name) as infile:
  186. for tree_str in infile.read().split("\n\n"):
  187. yield (
  188. iter(
  189. [
  190. DependencyGraph(
  191. tree_str, top_relation_label=top_relation_label
  192. )
  193. ]
  194. )
  195. )
  196. os.remove(input_file.name)
  197. os.remove(output_file.name)
  198. def parse_sents(self, sentences, verbose=False, top_relation_label="null"):
  199. """
  200. Use MaltParser to parse multiple sentences.
  201. Takes a list of sentences, where each sentence is a list of words.
  202. Each sentence will be automatically tagged with this
  203. MaltParser instance's tagger.
  204. :param sentences: Input sentences to parse
  205. :type sentence: list(list(str))
  206. :return: iter(DependencyGraph)
  207. """
  208. tagged_sentences = (self.tagger(sentence) for sentence in sentences)
  209. return self.parse_tagged_sents(
  210. tagged_sentences, verbose, top_relation_label=top_relation_label
  211. )
  212. def generate_malt_command(self, inputfilename, outputfilename=None, mode=None):
  213. """
  214. This function generates the maltparser command use at the terminal.
  215. :param inputfilename: path to the input file
  216. :type inputfilename: str
  217. :param outputfilename: path to the output file
  218. :type outputfilename: str
  219. """
  220. cmd = ["java"]
  221. cmd += self.additional_java_args # Adds additional java arguments
  222. # Joins classpaths with ";" if on Windows and on Linux/Mac use ":"
  223. classpaths_separator = ";" if sys.platform.startswith("win") else ":"
  224. cmd += [
  225. "-cp",
  226. classpaths_separator.join(self.malt_jars),
  227. ] # Adds classpaths for jars
  228. cmd += ["org.maltparser.Malt"] # Adds the main function.
  229. # Adds the model file.
  230. if os.path.exists(self.model): # when parsing
  231. cmd += ["-c", os.path.split(self.model)[-1]]
  232. else: # when learning
  233. cmd += ["-c", self.model]
  234. cmd += ["-i", inputfilename]
  235. if mode == "parse":
  236. cmd += ["-o", outputfilename]
  237. cmd += ["-m", mode] # mode use to generate parses.
  238. return cmd
  239. @staticmethod
  240. def _execute(cmd, verbose=False):
  241. output = None if verbose else subprocess.PIPE
  242. p = subprocess.Popen(cmd, stdout=output, stderr=output)
  243. return p.wait()
  244. def train(self, depgraphs, verbose=False):
  245. """
  246. Train MaltParser from a list of ``DependencyGraph`` objects
  247. :param depgraphs: list of ``DependencyGraph`` objects for training input data
  248. :type depgraphs: DependencyGraph
  249. """
  250. # Write the conll_str to malt_train.conll file in /tmp/
  251. with tempfile.NamedTemporaryFile(
  252. prefix="malt_train.conll.", dir=self.working_dir, mode="w", delete=False
  253. ) as input_file:
  254. input_str = "\n".join(dg.to_conll(10) for dg in depgraphs)
  255. input_file.write(str(input_str))
  256. # Trains the model with the malt_train.conll
  257. self.train_from_file(input_file.name, verbose=verbose)
  258. # Removes the malt_train.conll once training finishes.
  259. os.remove(input_file.name)
  260. def train_from_file(self, conll_file, verbose=False):
  261. """
  262. Train MaltParser from a file
  263. :param conll_file: str for the filename of the training input data
  264. :type conll_file: str
  265. """
  266. # If conll_file is a ZipFilePathPointer,
  267. # then we need to do some extra massaging
  268. if isinstance(conll_file, ZipFilePathPointer):
  269. with tempfile.NamedTemporaryFile(
  270. prefix="malt_train.conll.", dir=self.working_dir, mode="w", delete=False
  271. ) as input_file:
  272. with conll_file.open() as conll_input_file:
  273. conll_str = conll_input_file.read()
  274. input_file.write(str(conll_str))
  275. return self.train_from_file(input_file.name, verbose=verbose)
  276. # Generate command to run maltparser.
  277. cmd = self.generate_malt_command(conll_file, mode="learn")
  278. ret = self._execute(cmd, verbose)
  279. if ret != 0:
  280. raise Exception(
  281. "MaltParser training (%s) failed with exit "
  282. "code %d" % (" ".join(cmd), ret)
  283. )
  284. self._trained = True
if __name__ == '__main__':
    # Script entry point: run the doctests of this module.
    # NOTE(review): the string below is a bare expression statement inside the
    # ``if`` body, not a docstring. doctest collects docstrings (and
    # ``__test__``), so these demo examples do not appear to be executed by
    # ``doctest.testmod()`` -- confirm before relying on them as tests.
    """
    A demonstration function to show how NLTK users can use the malt parser API.
    >>> from nltk import pos_tag
    >>> assert 'MALT_PARSER' in os.environ, str(
    ... "Please set MALT_PARSER in your global environment, e.g.:\n"
    ... "$ export MALT_PARSER='/home/user/maltparser-1.7.2/'")
    >>>
    >>> assert 'MALT_MODEL' in os.environ, str(
    ... "Please set MALT_MODEL in your global environment, e.g.:\n"
    ... "$ export MALT_MODEL='/home/user/engmalt.linear-1.7.mco'")
    >>>
    >>> _dg1_str = str("1 John _ NNP _ _ 2 SUBJ _ _\n"
    ... "2 sees _ VB _ _ 0 ROOT _ _\n"
    ... "3 a _ DT _ _ 4 SPEC _ _\n"
    ... "4 dog _ NN _ _ 2 OBJ _ _\n"
    ... "5 . _ . _ _ 2 PUNCT _ _\n")
    >>>
    >>>
    >>> _dg2_str = str("1 John _ NNP _ _ 2 SUBJ _ _\n"
    ... "2 walks _ VB _ _ 0 ROOT _ _\n"
    ... "3 . _ . _ _ 2 PUNCT _ _\n")
    >>> dg1 = DependencyGraph(_dg1_str)
    >>> dg2 = DependencyGraph(_dg2_str)
    >>> # Initialize a MaltParser object
    >>> parser_dirname = 'maltparser-1.7.2'
    >>> mp = MaltParser(parser_dirname=parser_dirname)
    >>>
    >>> # Trains a model.
    >>> mp.train([dg1,dg2], verbose=False)
    >>> sent1 = ['John','sees','Mary', '.']
    >>> sent2 = ['John', 'walks', 'a', 'dog', '.']
    >>>
    >>> # Parse a single sentence.
    >>> parsed_sent1 = mp.parse_one(sent1)
    >>> parsed_sent2 = mp.parse_one(sent2)
    >>> print(parsed_sent1.tree())
    (sees John Mary .)
    >>> print(parsed_sent2.tree())
    (walks John (dog a) .)
    >>>
    >>> # Parsing multiple sentences.
    >>> sentences = [sent1,sent2]
    >>> parsed_sents = mp.parse_sents(sentences)
    >>> print(next(next(parsed_sents)).tree())
    (sees John Mary .)
    >>> print(next(next(parsed_sents)).tree())
    (walks John (dog a) .)
    >>>
    >>> # Initialize a MaltParser object with an English pre-trained model.
    >>> parser_dirname = 'maltparser-1.7.2'
    >>> model_name = 'engmalt.linear-1.7.mco'
    >>> mp = MaltParser(parser_dirname=parser_dirname, model_filename=model_name, tagger=pos_tag)
    >>> sent1 = 'I shot an elephant in my pajamas .'.split()
    >>> sent2 = 'Time flies like banana .'.split()
    >>> # Parse a single sentence.
    >>> print(mp.parse_one(sent1).tree())
    (shot I (elephant an) (in (pajamas my)) .)
    # Parsing multiple sentences
    >>> sentences = [sent1,sent2]
    >>> parsed_sents = mp.parse_sents(sentences)
    >>> print(next(next(parsed_sents)).tree())
    (shot I (elephant an) (in (pajamas my)) .)
    >>> print(next(next(parsed_sents)).tree())
    (flies Time (like banana) .)
    """
    import doctest
    doctest.testmod()