megam.py 6.1 KB

# Natural Language Toolkit: Interface to Megam Classifier
#
# Copyright (C) 2001-2020 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
"""
A set of functions used to interface with the external megam_ maxent
optimization package. Before megam can be used, you should tell NLTK where it
can find the megam binary, using the ``config_megam()`` function. Typical
usage:

    >>> from nltk.classify import megam
    >>> megam.config_megam() # pass path to megam if not found in PATH # doctest: +SKIP
    [Found megam: ...]

Use with MaxentClassifier. Example below, see MaxentClassifier documentation
for details.

    nltk.classify.MaxentClassifier.train(corpus, 'megam')

.. _megam: http://www.umiacs.umd.edu/~hal/megam/index.html
"""
  20. import subprocess
  21. from nltk.internals import find_binary
  22. try:
  23. import numpy
  24. except ImportError:
  25. numpy = None
  26. ######################################################################
  27. # { Configuration
  28. ######################################################################
  29. _megam_bin = None
  30. def config_megam(bin=None):
  31. """
  32. Configure NLTK's interface to the ``megam`` maxent optimization
  33. package.
  34. :param bin: The full path to the ``megam`` binary. If not specified,
  35. then nltk will search the system for a ``megam`` binary; and if
  36. one is not found, it will raise a ``LookupError`` exception.
  37. :type bin: str
  38. """
  39. global _megam_bin
  40. _megam_bin = find_binary(
  41. "megam",
  42. bin,
  43. env_vars=["MEGAM"],
  44. binary_names=["megam.opt", "megam", "megam_686", "megam_i686.opt"],
  45. url="http://www.umiacs.umd.edu/~hal/megam/index.html",
  46. )
  47. ######################################################################
  48. # { Megam Interface Functions
  49. ######################################################################
  50. def write_megam_file(train_toks, encoding, stream, bernoulli=True, explicit=True):
  51. """
  52. Generate an input file for ``megam`` based on the given corpus of
  53. classified tokens.
  54. :type train_toks: list(tuple(dict, str))
  55. :param train_toks: Training data, represented as a list of
  56. pairs, the first member of which is a feature dictionary,
  57. and the second of which is a classification label.
  58. :type encoding: MaxentFeatureEncodingI
  59. :param encoding: A feature encoding, used to convert featuresets
  60. into feature vectors. May optionally implement a cost() method
  61. in order to assign different costs to different class predictions.
  62. :type stream: stream
  63. :param stream: The stream to which the megam input file should be
  64. written.
  65. :param bernoulli: If true, then use the 'bernoulli' format. I.e.,
  66. all joint features have binary values, and are listed iff they
  67. are true. Otherwise, list feature values explicitly. If
  68. ``bernoulli=False``, then you must call ``megam`` with the
  69. ``-fvals`` option.
  70. :param explicit: If true, then use the 'explicit' format. I.e.,
  71. list the features that would fire for any of the possible
  72. labels, for each token. If ``explicit=True``, then you must
  73. call ``megam`` with the ``-explicit`` option.
  74. """
  75. # Look up the set of labels.
  76. labels = encoding.labels()
  77. labelnum = dict((label, i) for (i, label) in enumerate(labels))
  78. # Write the file, which contains one line per instance.
  79. for featureset, label in train_toks:
  80. # First, the instance number (or, in the weighted multiclass case, the cost of each label).
  81. if hasattr(encoding, "cost"):
  82. stream.write(
  83. ":".join(str(encoding.cost(featureset, label, l)) for l in labels)
  84. )
  85. else:
  86. stream.write("%d" % labelnum[label])
  87. # For implicit file formats, just list the features that fire
  88. # for this instance's actual label.
  89. if not explicit:
  90. _write_megam_features(encoding.encode(featureset, label), stream, bernoulli)
  91. # For explicit formats, list the features that would fire for
  92. # any of the possible labels.
  93. else:
  94. for l in labels:
  95. stream.write(" #")
  96. _write_megam_features(encoding.encode(featureset, l), stream, bernoulli)
  97. # End of the instance.
  98. stream.write("\n")
  99. def parse_megam_weights(s, features_count, explicit=True):
  100. """
  101. Given the stdout output generated by ``megam`` when training a
  102. model, return a ``numpy`` array containing the corresponding weight
  103. vector. This function does not currently handle bias features.
  104. """
  105. if numpy is None:
  106. raise ValueError("This function requires that numpy be installed")
  107. assert explicit, "non-explicit not supported yet"
  108. lines = s.strip().split("\n")
  109. weights = numpy.zeros(features_count, "d")
  110. for line in lines:
  111. if line.strip():
  112. fid, weight = line.split()
  113. weights[int(fid)] = float(weight)
  114. return weights
  115. def _write_megam_features(vector, stream, bernoulli):
  116. if not vector:
  117. raise ValueError(
  118. "MEGAM classifier requires the use of an " "always-on feature."
  119. )
  120. for (fid, fval) in vector:
  121. if bernoulli:
  122. if fval == 1:
  123. stream.write(" %s" % fid)
  124. elif fval != 0:
  125. raise ValueError(
  126. "If bernoulli=True, then all" "features must be binary."
  127. )
  128. else:
  129. stream.write(" %s %s" % (fid, fval))
  130. def call_megam(args):
  131. """
  132. Call the ``megam`` binary with the given arguments.
  133. """
  134. if isinstance(args, str):
  135. raise TypeError("args should be a list of strings")
  136. if _megam_bin is None:
  137. config_megam()
  138. # Call megam via a subprocess
  139. cmd = [_megam_bin] + args
  140. p = subprocess.Popen(cmd, stdout=subprocess.PIPE)
  141. (stdout, stderr) = p.communicate()
  142. # Check the return code.
  143. if p.returncode != 0:
  144. print()
  145. print(stderr)
  146. raise OSError("megam command failed!")
  147. if isinstance(stdout, str):
  148. return stdout
  149. else:
  150. return stdout.decode("utf-8")