# Natural Language Toolkit: Maximum Entropy Classifiers
#
# Copyright (C) 2001-2020 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
#         Dmitry Chichkov <dchichkov@gmail.com> (TypedMaxentFeatureEncoding)
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
"""
A classifier model based on maximum entropy modeling framework.  This
framework considers all of the probability distributions that are
empirically consistent with the training data; and chooses the
distribution with the highest entropy.  A probability distribution is
"empirically consistent" with a set of training data if its estimated
frequency with which a class and a feature vector value co-occur is
equal to the actual frequency in the data.

Terminology: 'feature'
======================
The term *feature* is usually used to refer to some property of an
unlabeled token.  For example, when performing word sense
disambiguation, we might define a ``'prevword'`` feature whose value is
the word preceding the target word.  However, in the context of
maxent modeling, the term *feature* is typically used to refer to a
property of a "labeled" token.  In order to prevent confusion, we
will introduce two distinct terms to disambiguate these two different
concepts:

  - An "input-feature" is a property of an unlabeled token.
  - A "joint-feature" is a property of a labeled token.

In the rest of the ``nltk.classify`` module, the term "features" is
used to refer to what we will call "input-features" in this module.

In literature that describes and discusses maximum entropy models,
input-features are typically called "contexts", and joint-features
are simply referred to as "features".

Converting Input-Features to Joint-Features
-------------------------------------------
In maximum entropy models, joint-features are required to have numeric
values.  Typically, each input-feature ``input_feat`` is mapped to a
set of joint-features of the form:

|   joint_feat(token, label) = { 1 if input_feat(token) == feat_val
|                              {      and label == some_label
|                              {
|                              { 0 otherwise

for all values of ``feat_val`` and ``some_label``.  This mapping is
performed by classes that implement the ``MaxentFeatureEncodingI``
interface.
"""
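
# The input-feature -> joint-feature mapping described above can be pictured
# as a small Python function.  This is an illustrative sketch only (the
# feature name 'prevword', the value 'bank', and the label 'FINANCE' are
# made up for the example):
#
#     def joint_feat(token, label):
#         return 1 if token.get("prevword") == "bank" and label == "FINANCE" else 0
#
#     joint_feat({"prevword": "bank"}, "FINANCE")  # -> 1
#     joint_feat({"prevword": "bank"}, "SPORTS")   # -> 0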

try:
    import numpy
except ImportError:
    pass

import tempfile
import os
from collections import defaultdict

from nltk.data import gzip_open_unicode
from nltk.util import OrderedDict
from nltk.probability import DictionaryProbDist

from nltk.classify.api import ClassifierI
from nltk.classify.util import CutoffChecker, accuracy, log_likelihood
from nltk.classify.megam import call_megam, write_megam_file, parse_megam_weights
from nltk.classify.tadm import call_tadm, write_tadm_file, parse_tadm_weights

__docformat__ = "epytext en"

######################################################################
# { Classifier Model
######################################################################


class MaxentClassifier(ClassifierI):
    """
    A maximum entropy classifier (also known as a "conditional
    exponential classifier").  This classifier is parameterized by a
    set of "weights", which are used to combine the joint-features
    that are generated from a featureset by an "encoding".  In
    particular, the encoding maps each ``(featureset, label)`` pair to
    a vector.  The probability of each label is then computed using
    the following equation::

                                    dotprod(weights, encode(fs,label))
      prob(label|fs) = ---------------------------------------------------
                       sum(dotprod(weights, encode(fs,l)) for l in labels)

    Where ``dotprod`` is the dot product::

      dotprod(a,b) = sum(x*y for (x,y) in zip(a,b))
    """

    def __init__(self, encoding, weights, logarithmic=True):
        """
        Construct a new maxent classifier model.  Typically, new
        classifier models are created using the ``train()`` method.

        :type encoding: MaxentFeatureEncodingI
        :param encoding: An encoding that is used to convert the
            featuresets that are given to the ``classify`` method into
            joint-feature vectors, which are used by the maxent
            classifier model.

        :type weights: list of float
        :param weights: The feature weight vector for this classifier.

        :type logarithmic: bool
        :param logarithmic: If false, then use non-logarithmic weights.
        """
        self._encoding = encoding
        self._weights = weights
        self._logarithmic = logarithmic
        # self._logarithmic = False
        assert encoding.length() == len(weights)

    def labels(self):
        return self._encoding.labels()

    def set_weights(self, new_weights):
        """
        Set the feature weight vector for this classifier.

        :param new_weights: The new feature weight vector.
        :type new_weights: list of float
        """
        self._weights = new_weights
        assert self._encoding.length() == len(new_weights)

    def weights(self):
        """
        :return: The feature weight vector for this classifier.
        :rtype: list of float
        """
        return self._weights

    def classify(self, featureset):
        return self.prob_classify(featureset).max()

    def prob_classify(self, featureset):
        prob_dict = {}
        for label in self._encoding.labels():
            feature_vector = self._encoding.encode(featureset, label)
            if self._logarithmic:
                total = 0.0
                for (f_id, f_val) in feature_vector:
                    total += self._weights[f_id] * f_val
                prob_dict[label] = total
            else:
                prod = 1.0
                for (f_id, f_val) in feature_vector:
                    prod *= self._weights[f_id] ** f_val
                prob_dict[label] = prod

        # Normalize the dictionary to give a probability distribution
        return DictionaryProbDist(prob_dict, log=self._logarithmic, normalize=True)

    def explain(self, featureset, columns=4):
        """
        Print a table showing the effect of each of the features in
        the given feature set, and how they combine to determine the
        probabilities of each label for that featureset.
        """
        descr_width = 50
        TEMPLATE = "  %-" + str(descr_width - 2) + "s%s%8.3f"

        pdist = self.prob_classify(featureset)
        labels = sorted(pdist.samples(), key=pdist.prob, reverse=True)
        labels = labels[:columns]
        print(
            "  Feature".ljust(descr_width)
            + "".join("%8s" % (("%s" % l)[:7]) for l in labels)
        )
        print("  " + "-" * (descr_width - 2 + 8 * len(labels)))
        sums = defaultdict(int)
        for i, label in enumerate(labels):
            feature_vector = self._encoding.encode(featureset, label)
            feature_vector.sort(
                key=lambda fid__: abs(self._weights[fid__[0]]), reverse=True
            )
            for (f_id, f_val) in feature_vector:
                if self._logarithmic:
                    score = self._weights[f_id] * f_val
                else:
                    score = self._weights[f_id] ** f_val
                descr = self._encoding.describe(f_id)
                descr = descr.split(" and label is ")[0]  # hack
                descr += " (%s)" % f_val  # hack
                if len(descr) > 47:
                    descr = descr[:44] + "..."
                print(TEMPLATE % (descr, i * 8 * " ", score))
                sums[label] += score
        print("  " + "-" * (descr_width - 1 + 8 * len(labels)))
        print(
            "  TOTAL:".ljust(descr_width) + "".join("%8.3f" % sums[l] for l in labels)
        )
        print(
            "  PROBS:".ljust(descr_width)
            + "".join("%8.3f" % pdist.prob(l) for l in labels)
        )

    def most_informative_features(self, n=10):
        """
        Generates the ranked list of informative features from most to least.
        """
        if hasattr(self, "_most_informative_features"):
            return self._most_informative_features[:n]
        else:
            self._most_informative_features = sorted(
                list(range(len(self._weights))),
                key=lambda fid: abs(self._weights[fid]),
                reverse=True,
            )
            return self._most_informative_features[:n]

    def show_most_informative_features(self, n=10, show="all"):
        """
        :param show: all, neg, or pos (for negative-only or positive-only)
        :type show: str
        :param n: The number of top features to show
        :type n: int
        """
        # Use None to get the full list of ranked features.
        fids = self.most_informative_features(None)
        if show == "pos":
            fids = [fid for fid in fids if self._weights[fid] > 0]
        elif show == "neg":
            fids = [fid for fid in fids if self._weights[fid] < 0]
        for fid in fids[:n]:
            print("%8.3f %s" % (self._weights[fid], self._encoding.describe(fid)))

    def __repr__(self):
        return "<ConditionalExponentialClassifier: %d labels, %d features>" % (
            len(self._encoding.labels()),
            self._encoding.length(),
        )

    #: A list of the algorithm names that are accepted for the
    #: ``train()`` method's ``algorithm`` parameter.
    ALGORITHMS = ["GIS", "IIS", "MEGAM", "TADM"]

    @classmethod
    def train(
        cls,
        train_toks,
        algorithm=None,
        trace=3,
        encoding=None,
        labels=None,
        gaussian_prior_sigma=0,
        **cutoffs
    ):
        """
        Train a new maxent classifier based on the given corpus of
        training samples.  This classifier will have its weights
        chosen to maximize entropy while remaining empirically
        consistent with the training corpus.

        :rtype: MaxentClassifier
        :return: The new maxent classifier

        :type train_toks: list
        :param train_toks: Training data, represented as a list of
            pairs, the first member of which is a featureset,
            and the second of which is a classification label.

        :type algorithm: str
        :param algorithm: A case-insensitive string, specifying which
            algorithm should be used to train the classifier.  The
            following algorithms are currently available.

            - Iterative Scaling Methods: Generalized Iterative Scaling (``'GIS'``),
              Improved Iterative Scaling (``'IIS'``)
            - External Libraries (requiring megam):
              LM-BFGS algorithm, with training performed by Megam (``'megam'``)

            The default algorithm is ``'IIS'``.

        :type trace: int
        :param trace: The level of diagnostic tracing output to produce.
            Higher values produce more verbose output.

        :type encoding: MaxentFeatureEncodingI
        :param encoding: A feature encoding, used to convert featuresets
            into feature vectors.  If none is specified, then a
            ``BinaryMaxentFeatureEncoding`` will be built based on the
            features that are attested in the training corpus.

        :type labels: list(str)
        :param labels: The set of possible labels.  If none is given, then
            the set of all labels attested in the training data will be
            used instead.

        :param gaussian_prior_sigma: The sigma value for a gaussian
            prior on model weights.  Currently, this is supported by
            ``megam``.  For other algorithms, its value is ignored.

        :param cutoffs: Arguments specifying various conditions under
            which the training should be halted.  (Some of the cutoff
            conditions are not supported by some algorithms.)

            - ``max_iter=v``: Terminate after ``v`` iterations.
            - ``min_ll=v``: Terminate after the negative average
              log-likelihood drops under ``v``.
            - ``min_lldelta=v``: Terminate if a single iteration improves
              log likelihood by less than ``v``.
        """
        if algorithm is None:
            algorithm = "iis"
        for key in cutoffs:
            if key not in (
                "max_iter",
                "min_ll",
                "min_lldelta",
                "max_acc",
                "min_accdelta",
                "count_cutoff",
                "norm",
                "explicit",
                "bernoulli",
            ):
                raise TypeError("Unexpected keyword arg %r" % key)
        algorithm = algorithm.lower()
        if algorithm == "iis":
            return train_maxent_classifier_with_iis(
                train_toks, trace, encoding, labels, **cutoffs
            )
        elif algorithm == "gis":
            return train_maxent_classifier_with_gis(
                train_toks, trace, encoding, labels, **cutoffs
            )
        elif algorithm == "megam":
            return train_maxent_classifier_with_megam(
                train_toks, trace, encoding, labels, gaussian_prior_sigma, **cutoffs
            )
        elif algorithm == "tadm":
            kwargs = cutoffs
            kwargs["trace"] = trace
            kwargs["encoding"] = encoding
            kwargs["labels"] = labels
            kwargs["gaussian_prior_sigma"] = gaussian_prior_sigma
            return TadmMaxentClassifier.train(train_toks, **kwargs)
        else:
            raise ValueError("Unknown algorithm %s" % algorithm)


#: Alias for MaxentClassifier.
ConditionalExponentialClassifier = MaxentClassifier
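
# A minimal training sketch, assuming a toy corpus of (featureset, label)
# pairs; the feature names and labels here are illustrative only:
#
#     train = [({"last_letter": "a"}, "female"),
#              ({"last_letter": "k"}, "male"),
#              ({"last_letter": "e"}, "female")]
#     classifier = MaxentClassifier.train(train, algorithm="iis",
#                                         trace=0, max_iter=10)
#     classifier.classify({"last_letter": "a"})    # most likely label
#     classifier.show_most_informative_features(5)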

######################################################################
# { Feature Encodings
######################################################################


class MaxentFeatureEncodingI(object):
    """
    A mapping that converts a set of input-feature values to a vector
    of joint-feature values, given a label.  This conversion is
    necessary to translate featuresets into a format that can be used
    by maximum entropy models.

    The set of joint-features used by a given encoding is fixed, and
    each index in the generated joint-feature vectors corresponds to a
    single joint-feature.  The length of the generated joint-feature
    vectors is therefore constant (for a given encoding).

    Because the joint-feature vectors generated by
    ``MaxentFeatureEncodingI`` are typically very sparse, they are
    represented as a list of ``(index, value)`` tuples, specifying the
    value of each non-zero joint-feature.

    Feature encodings are generally created using the ``train()``
    method, which generates an appropriate encoding based on the
    input-feature values and labels that are present in a given
    corpus.
    """

    def encode(self, featureset, label):
        """
        Given a (featureset, label) pair, return the corresponding
        vector of joint-feature values.  This vector is represented as
        a list of ``(index, value)`` tuples, specifying the value of
        each non-zero joint-feature.

        :type featureset: dict
        :rtype: list(tuple(int, int))
        """
        raise NotImplementedError()

    def length(self):
        """
        :return: The size of the fixed-length joint-feature vectors
            that are generated by this encoding.
        :rtype: int
        """
        raise NotImplementedError()

    def labels(self):
        """
        :return: A list of the \"known labels\" -- i.e., all labels
            ``l`` such that ``self.encode(fs,l)`` can be a nonzero
            joint-feature vector for some value of ``fs``.
        :rtype: list
        """
        raise NotImplementedError()

    def describe(self, fid):
        """
        :return: A string describing the value of the joint-feature
            whose index in the generated feature vectors is ``fid``.
        :rtype: str
        """
        raise NotImplementedError()

    def train(cls, train_toks):
        """
        Construct and return new feature encoding, based on a given
        training corpus ``train_toks``.

        :type train_toks: list(tuple(dict, str))
        :param train_toks: Training data, represented as a list of
            pairs, the first member of which is a feature dictionary,
            and the second of which is a classification label.
        """
        raise NotImplementedError()

class FunctionBackedMaxentFeatureEncoding(MaxentFeatureEncodingI):
    """
    A feature encoding that calls a user-supplied function to map a
    given featureset/label pair to a sparse joint-feature vector.
    """

    def __init__(self, func, length, labels):
        """
        Construct a new feature encoding based on the given function.

        :type func: (callable)
        :param func: A function that takes two arguments, a featureset
            and a label, and returns the sparse joint feature vector
            that encodes them::

                func(featureset, label) -> feature_vector

            This sparse joint feature vector (``feature_vector``) is a
            list of ``(index, value)`` tuples.

        :type length: int
        :param length: The size of the fixed-length joint-feature
            vectors that are generated by this encoding.

        :type labels: list
        :param labels: A list of the \"known labels\" for this
            encoding -- i.e., all labels ``l`` such that
            ``self.encode(fs,l)`` can be a nonzero joint-feature vector
            for some value of ``fs``.
        """
        self._length = length
        self._func = func
        self._labels = labels

    def encode(self, featureset, label):
        return self._func(featureset, label)

    def length(self):
        return self._length

    def labels(self):
        return self._labels

    def describe(self, fid):
        return "no description available"
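
# A sketch of a function-backed encoding; the feature function, vector
# length, and labels below are hypothetical, not part of the library:
#
#     def toy_encode(featureset, label):
#         # joint-feature 0 fires for label 'pos', joint-feature 1 for 'neg'
#         return [(0, 1)] if label == "pos" else [(1, 1)]
#
#     encoding = FunctionBackedMaxentFeatureEncoding(toy_encode, 2, ["pos", "neg"])
#     encoding.encode({"word": "great"}, "pos")  # -> [(0, 1)]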

class BinaryMaxentFeatureEncoding(MaxentFeatureEncodingI):
    """
    A feature encoding that generates vectors containing binary
    joint-features of the form:

    |  joint_feat(fs, l) = { 1 if (fs[fname] == fval) and (l == label)
    |                      {
    |                      { 0 otherwise

    Where ``fname`` is the name of an input-feature, ``fval`` is a value
    for that input-feature, and ``label`` is a label.

    Typically, these features are constructed based on a training
    corpus, using the ``train()`` method.  This method will create one
    feature for each combination of ``fname``, ``fval``, and ``label``
    that occurs at least once in the training corpus.

    The ``unseen_features`` parameter can be used to add "unseen-value
    features", which are used whenever an input feature has a value
    that was not encountered in the training corpus.  These features
    have the form:

    |  joint_feat(fs, l) = { 1 if is_unseen(fname, fs[fname])
    |                      {      and l == label
    |                      {
    |                      { 0 otherwise

    Where ``is_unseen(fname, fval)`` is true if the encoding does not
    contain any joint features that are true when ``fs[fname]==fval``.

    The ``alwayson_features`` parameter can be used to add "always-on
    features", which have the form:

    |  joint_feat(fs, l) = { 1 if (l == label)
    |                      {
    |                      { 0 otherwise

    These always-on features allow the maxent model to directly model
    the prior probabilities of each label.
    """

    def __init__(self, labels, mapping, unseen_features=False, alwayson_features=False):
        """
        :param labels: A list of the \"known labels\" for this encoding.

        :param mapping: A dictionary mapping from ``(fname,fval,label)``
            tuples to corresponding joint-feature indexes.  These
            indexes must be the set of integers from 0...len(mapping).
            If ``mapping[fname,fval,label]=id``, then
            ``self.encode({..., fname:fval, ...}, label)[id]`` is 1;
            otherwise, it is 0.

        :param unseen_features: If true, then include unseen value
            features in the generated joint-feature vectors.

        :param alwayson_features: If true, then include always-on
            features in the generated joint-feature vectors.
        """
        if set(mapping.values()) != set(range(len(mapping))):
            raise ValueError(
                "Mapping values must be exactly the "
                "set of integers from 0...len(mapping)"
            )

        self._labels = list(labels)
        """A list of attested labels."""

        self._mapping = mapping
        """dict mapping from (fname,fval,label) -> fid"""

        self._length = len(mapping)
        """The length of generated joint feature vectors."""

        self._alwayson = None
        """dict mapping from label -> fid"""

        self._unseen = None
        """dict mapping from fname -> fid"""

        if alwayson_features:
            self._alwayson = dict(
                (label, i + self._length) for (i, label) in enumerate(labels)
            )
            self._length += len(self._alwayson)

        if unseen_features:
            fnames = set(fname for (fname, fval, label) in mapping)
            self._unseen = dict(
                (fname, i + self._length) for (i, fname) in enumerate(fnames)
            )
            self._length += len(fnames)

    def encode(self, featureset, label):
        # Inherit docs.
        encoding = []

        # Convert input-features to joint-features:
        for fname, fval in featureset.items():
            # Known feature name & value:
            if (fname, fval, label) in self._mapping:
                encoding.append((self._mapping[fname, fval, label], 1))

            # Otherwise, we might want to fire an "unseen-value feature".
            elif self._unseen:
                # Have we seen this fname/fval combination with any label?
                for label2 in self._labels:
                    if (fname, fval, label2) in self._mapping:
                        break  # we've seen this fname/fval combo
                # We haven't -- fire the unseen-value feature
                else:
                    if fname in self._unseen:
                        encoding.append((self._unseen[fname], 1))

        # Add always-on features:
        if self._alwayson and label in self._alwayson:
            encoding.append((self._alwayson[label], 1))

        return encoding

    def describe(self, f_id):
        # Inherit docs.
        if not isinstance(f_id, int):
            raise TypeError("describe() expected an int")
        try:
            self._inv_mapping
        except AttributeError:
            self._inv_mapping = [-1] * len(self._mapping)
            for (info, i) in self._mapping.items():
                self._inv_mapping[i] = info

        if f_id < len(self._mapping):
            (fname, fval, label) = self._inv_mapping[f_id]
            return "%s==%r and label is %r" % (fname, fval, label)
        elif self._alwayson and f_id in self._alwayson.values():
            for (label, f_id2) in self._alwayson.items():
                if f_id == f_id2:
                    return "label is %r" % label
        elif self._unseen and f_id in self._unseen.values():
            for (fname, f_id2) in self._unseen.items():
                if f_id == f_id2:
                    return "%s is unseen" % fname
        else:
            raise ValueError("Bad feature id")

    def labels(self):
        # Inherit docs.
        return self._labels

    def length(self):
        # Inherit docs.
        return self._length

    @classmethod
    def train(cls, train_toks, count_cutoff=0, labels=None, **options):
        """
        Construct and return new feature encoding, based on a given
        training corpus ``train_toks``.  See the class description
        ``BinaryMaxentFeatureEncoding`` for a description of the
        joint-features that will be included in this encoding.

        :type train_toks: list(tuple(dict, str))
        :param train_toks: Training data, represented as a list of
            pairs, the first member of which is a feature dictionary,
            and the second of which is a classification label.

        :type count_cutoff: int
        :param count_cutoff: A cutoff value that is used to discard
            rare joint-features.  If a joint-feature takes the value 1
            fewer than ``count_cutoff`` times in the training corpus,
            then that joint-feature is not included in the generated
            encoding.

        :type labels: list
        :param labels: A list of labels that should be used by the
            classifier.  If not specified, then the set of labels
            attested in ``train_toks`` will be used.

        :param options: Extra parameters for the constructor, such as
            ``unseen_features`` and ``alwayson_features``.
        """
        mapping = {}  # maps (fname, fval, label) -> fid
        seen_labels = set()  # The set of labels we've encountered
        count = defaultdict(int)  # maps (fname, fval) -> count

        for (tok, label) in train_toks:
            if labels and label not in labels:
                raise ValueError("Unexpected label %s" % label)
            seen_labels.add(label)

            # Record each of the features.
            for (fname, fval) in tok.items():

                # If a count cutoff is given, then only add a joint
                # feature once the corresponding (fname, fval, label)
                # tuple exceeds that cutoff.
                count[fname, fval] += 1
                if count[fname, fval] >= count_cutoff:
                    if (fname, fval, label) not in mapping:
                        mapping[fname, fval, label] = len(mapping)

        if labels is None:
            labels = seen_labels
        return cls(labels, mapping, **options)
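
# A sketch of training a binary encoding from a toy corpus (feature names,
# values, and labels are illustrative).  Each attested (fname, fval, label)
# triple gets its own joint-feature index:
#
#     corpus = [({"suffix": "ing"}, "VERB"), ({"suffix": "ly"}, "ADV")]
#     enc = BinaryMaxentFeatureEncoding.train(corpus)
#     enc.length()                           # number of joint-features
#     enc.encode({"suffix": "ing"}, "VERB")  # e.g. [(0, 1)]
#     enc.encode({"suffix": "ing"}, "ADV")   # [] -- combination never attested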

class GISEncoding(BinaryMaxentFeatureEncoding):
    """
    A binary feature encoding which adds one new joint-feature to the
    joint-features defined by ``BinaryMaxentFeatureEncoding``: a
    correction feature, whose value is chosen to ensure that the
    sparse vector always sums to a constant non-negative number.  This
    new feature is used to ensure two preconditions for the GIS
    training algorithm:

      - At least one feature vector index must be nonzero for every
        token.
      - The feature vector must sum to a constant non-negative number
        for every token.
    """

    def __init__(
        self, labels, mapping, unseen_features=False, alwayson_features=False, C=None
    ):
        """
        :param C: The correction constant.  The value of the correction
            feature is based on this value.  In particular, its value is
            ``C - sum([v for (f,v) in encoding])``.
        :seealso: ``BinaryMaxentFeatureEncoding.__init__``
        """
        BinaryMaxentFeatureEncoding.__init__(
            self, labels, mapping, unseen_features, alwayson_features
        )
        if C is None:
            C = len(set(fname for (fname, fval, label) in mapping)) + 1
        self._C = C

    @property
    def C(self):
        """The non-negative constant that all encoded feature vectors
        will sum to."""
        return self._C

    def encode(self, featureset, label):
        # Get the basic encoding.
        encoding = BinaryMaxentFeatureEncoding.encode(self, featureset, label)
        base_length = BinaryMaxentFeatureEncoding.length(self)

        # Add a correction feature.
        total = sum(v for (f, v) in encoding)
        if total >= self._C:
            raise ValueError("Correction feature is not high enough!")
        encoding.append((base_length, self._C - total))

        # Return the result
        return encoding

    def length(self):
        return BinaryMaxentFeatureEncoding.length(self) + 1

    def describe(self, f_id):
        if f_id == BinaryMaxentFeatureEncoding.length(self):
            return "Correction feature (%s)" % self._C
        else:
            return BinaryMaxentFeatureEncoding.describe(self, f_id)
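
# A sketch of the correction feature added by GISEncoding (toy corpus as
# above; the exact indices depend on the trained mapping).  Every encoded
# vector sums to the same constant C:
#
#     enc = GISEncoding.train([({"suffix": "ing"}, "VERB")])
#     vec = enc.encode({"suffix": "ing"}, "VERB")
#     sum(v for (f, v) in vec) == enc.C   # True -- the correction feature pads the sum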

class TadmEventMaxentFeatureEncoding(BinaryMaxentFeatureEncoding):
    def __init__(self, labels, mapping, unseen_features=False, alwayson_features=False):
        self._mapping = OrderedDict(mapping)
        self._label_mapping = OrderedDict()
        BinaryMaxentFeatureEncoding.__init__(
            self, labels, self._mapping, unseen_features, alwayson_features
        )

    def encode(self, featureset, label):
        encoding = []
        for feature, value in featureset.items():
            if (feature, label) not in self._mapping:
                self._mapping[(feature, label)] = len(self._mapping)
            if value not in self._label_mapping:
                if not isinstance(value, int):
                    self._label_mapping[value] = len(self._label_mapping)
                else:
                    self._label_mapping[value] = value
            encoding.append(
                (self._mapping[(feature, label)], self._label_mapping[value])
            )
        return encoding

    def labels(self):
        return self._labels

    def describe(self, fid):
        for (feature, label) in self._mapping:
            if self._mapping[(feature, label)] == fid:
                return (feature, label)

    def length(self):
        return len(self._mapping)

    @classmethod
    def train(cls, train_toks, count_cutoff=0, labels=None, **options):
        mapping = OrderedDict()
        if not labels:
            labels = []

        # This gets read twice, so compute the values in case it's lazy.
        train_toks = list(train_toks)

        for (featureset, label) in train_toks:
            if label not in labels:
                labels.append(label)

        for (featureset, label) in train_toks:
            for label in labels:
                for feature in featureset:
                    if (feature, label) not in mapping:
                        mapping[(feature, label)] = len(mapping)

        return cls(labels, mapping, **options)

class TypedMaxentFeatureEncoding(MaxentFeatureEncodingI):
    """
    A feature encoding that generates vectors containing integer,
    float and binary joint-features of the form:

    Binary (for string and boolean features):

    |  joint_feat(fs, l) = { 1 if (fs[fname] == fval) and (l == label)
    |                      {
    |                      { 0 otherwise

    Value (for integer and float features):

    |  joint_feat(fs, l) = { fval if (fs[fname] == type(fval))
    |                      {         and (l == label)
    |                      {
    |                      { not encoded otherwise

    Where ``fname`` is the name of an input-feature, ``fval`` is a value
    for that input-feature, and ``label`` is a label.

    Typically, these features are constructed based on a training
    corpus, using the ``train()`` method.

    For string and boolean features [type(fval) not in (int, float)]
    this method will create one feature for each combination of
    ``fname``, ``fval``, and ``label`` that occurs at least once in the
    training corpus.

    For integer and float features [type(fval) in (int, float)] this
    method will create one feature for each combination of ``fname``
    and ``label`` that occurs at least once in the training corpus.

    For binary features the ``unseen_features`` parameter can be used
    to add "unseen-value features", which are used whenever an input
    feature has a value that was not encountered in the training
    corpus.  These features have the form:

    |  joint_feat(fs, l) = { 1 if is_unseen(fname, fs[fname])
    |                      {      and l == label
    |                      {
    |                      { 0 otherwise

    Where ``is_unseen(fname, fval)`` is true if the encoding does not
    contain any joint features that are true when ``fs[fname]==fval``.

    The ``alwayson_features`` parameter can be used to add "always-on
    features", which have the form:

    |  joint_feat(fs, l) = { 1 if (l == label)
    |                      {
    |                      { 0 otherwise

    These always-on features allow the maxent model to directly model
    the prior probabilities of each label.
    """

    def __init__(self, labels, mapping, unseen_features=False, alwayson_features=False):
        """
        :param labels: A list of the \"known labels\" for this encoding.

        :param mapping: A dictionary mapping from ``(fname,fval,label)``
            tuples to corresponding joint-feature indexes.  These
            indexes must be the set of integers from 0...len(mapping).
            If ``mapping[fname,fval,label]=id``, then
            ``self.encode({..., fname:fval, ...}, label)[id]`` is 1;
            otherwise, it is 0.

        :param unseen_features: If true, then include unseen value
            features in the generated joint-feature vectors.

        :param alwayson_features: If true, then include always-on
            features in the generated joint-feature vectors.
        """
        if set(mapping.values()) != set(range(len(mapping))):
            raise ValueError(
                "Mapping values must be exactly the "
                "set of integers from 0...len(mapping)"
            )

        self._labels = list(labels)
        """A list of attested labels."""

        self._mapping = mapping
        """dict mapping from (fname,fval,label) -> fid"""

        self._length = len(mapping)
        """The length of generated joint feature vectors."""

        self._alwayson = None
        """dict mapping from label -> fid"""

        self._unseen = None
        """dict mapping from fname -> fid"""

        if alwayson_features:
            self._alwayson = dict(
                (label, i + self._length) for (i, label) in enumerate(labels)
            )
            self._length += len(self._alwayson)

        if unseen_features:
            fnames = set(fname for (fname, fval, label) in mapping)
            self._unseen = dict(
                (fname, i + self._length) for (i, fname) in enumerate(fnames)
            )
            self._length += len(fnames)

    def encode(self, featureset, label):
        # Inherit docs.
        encoding = []

        # Convert input-features to joint-features:
        for fname, fval in featureset.items():
            if isinstance(fval, (int, float)):
                # Known feature name & value:
                if (fname, type(fval), label) in self._mapping:
                    encoding.append((self._mapping[fname, type(fval), label], fval))
            else:
                # Known feature name & value:
                if (fname, fval, label) in self._mapping:
                    encoding.append((self._mapping[fname, fval, label], 1))

                # Otherwise, we might want to fire an "unseen-value feature".
                elif self._unseen:
                    # Have we seen this fname/fval combination with any label?
                    for label2 in self._labels:
                        if (fname, fval, label2) in self._mapping:
                            break  # we've seen this fname/fval combo
                    # We haven't -- fire the unseen-value feature
                    else:
                        if fname in self._unseen:
                            encoding.append((self._unseen[fname], 1))

        # Add always-on features:
        if self._alwayson and label in self._alwayson:
            encoding.append((self._alwayson[label], 1))

        return encoding

    def describe(self, f_id):
        # Inherit docs.
        if not isinstance(f_id, int):
            raise TypeError("describe() expected an int")
        try:
            self._inv_mapping
        except AttributeError:
            self._inv_mapping = [-1] * len(self._mapping)
            for (info, i) in self._mapping.items():
                self._inv_mapping[i] = info

        if f_id < len(self._mapping):
            (fname, fval, label) = self._inv_mapping[f_id]
            return "%s==%r and label is %r" % (fname, fval, label)
        elif self._alwayson and f_id in self._alwayson.values():
            for (label, f_id2) in self._alwayson.items():
                if f_id == f_id2:
                    return "label is %r" % label
        elif self._unseen and f_id in self._unseen.values():
            for (fname, f_id2) in self._unseen.items():
                if f_id == f_id2:
                    return "%s is unseen" % fname
        else:
            raise ValueError("Bad feature id")

    def labels(self):
        # Inherit docs.
        return self._labels

    def length(self):
        # Inherit docs.
        return self._length

    @classmethod
    def train(cls, train_toks, count_cutoff=0, labels=None, **options):
        """
        Construct and return new feature encoding, based on a given
        training corpus ``train_toks``.  See the class description
        ``TypedMaxentFeatureEncoding`` for a description of the
        joint-features that will be included in this encoding.

        Note: the recognized feature value types are (int, float); other
        types are interpreted as regular binary features.

        :type train_toks: list(tuple(dict, str))
        :param train_toks: Training data, represented as a list of
            pairs, the first member of which is a feature dictionary,
            and the second of which is a classification label.

        :type count_cutoff: int
        :param count_cutoff: A cutoff value that is used to discard
            rare joint-features.  If a joint-feature takes the value 1
            fewer than ``count_cutoff`` times in the training corpus,
            then that joint-feature is not included in the generated
            encoding.

        :type labels: list
        :param labels: A list of labels that should be used by the
            classifier.  If not specified, then the set of labels
            attested in ``train_toks`` will be used.

        :param options: Extra parameters for the constructor, such as
            ``unseen_features`` and ``alwayson_features``.
        """
        mapping = {}  # maps (fname, fval, label) -> fid
        seen_labels = set()  # The set of labels we've encountered
        count = defaultdict(int)  # maps (fname, fval) -> count

        for (tok, label) in train_toks:
            if labels and label not in labels:
                raise ValueError("Unexpected label %s" % label)
            seen_labels.add(label)

            # Record each of the features.
            for (fname, fval) in tok.items():
                if type(fval) in (int, float):
                    fval = type(fval)

                # If a count cutoff is given, then only add a joint
                # feature once the corresponding (fname, fval, label)
                # tuple exceeds that cutoff.
                count[fname, fval] += 1
                if count[fname, fval] >= count_cutoff:
                    if (fname, fval, label) not in mapping:
                        mapping[fname, fval, label] = len(mapping)

        if labels is None:
            labels = seen_labels
        return cls(labels, mapping, **options)
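
# A sketch of a typed encoding: integer/float input-features are keyed by
# their type, and their value is carried into the joint-feature, while string
# features are encoded as binary joint-features (names and labels are
# illustrative):
#
#     corpus = [({"len": 7, "suffix": "ing"}, "VERB")]
#     enc = TypedMaxentFeatureEncoding.train(corpus)
#     enc.encode({"len": 3, "suffix": "ing"}, "VERB")
#     # -> the 'len' joint-feature carries the value 3; 'suffix' contributes a 1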

######################################################################
# { Classifier Trainer: Generalized Iterative Scaling
######################################################################


def train_maxent_classifier_with_gis(
    train_toks, trace=3, encoding=None, labels=None, **cutoffs
):
    """
    Train a new ``ConditionalExponentialClassifier``, using the given
    training samples, using the Generalized Iterative Scaling
    algorithm.  This ``ConditionalExponentialClassifier`` will encode
    the model that maximizes entropy from all the models that are
    empirically consistent with ``train_toks``.

    :see: ``train_maxent_classifier()`` for parameter descriptions.
    """
    cutoffs.setdefault("max_iter", 100)
    cutoffchecker = CutoffChecker(cutoffs)

    # Construct an encoding from the training data.
    if encoding is None:
        encoding = GISEncoding.train(train_toks, labels=labels)

    if not hasattr(encoding, "C"):
        raise TypeError(
            "The GIS algorithm requires an encoding that "
            "defines C (e.g., GISEncoding)."
        )

    # Cinv is the inverse of the sum of each joint feature vector.
    # This controls the learning rate: higher Cinv (or lower C) gives
    # faster learning.
    Cinv = 1.0 / encoding.C

    # Count how many times each feature occurs in the training data.
    empirical_fcount = calculate_empirical_fcount(train_toks, encoding)

    # Check for any features that are not attested in train_toks.
    unattested = set(numpy.nonzero(empirical_fcount == 0)[0])

    # Build the classifier.  Start with weight=0 for each attested
    # feature, and weight=-infinity for each unattested feature.
    weights = numpy.zeros(len(empirical_fcount), "d")
    for fid in unattested:
        weights[fid] = numpy.NINF
    classifier = ConditionalExponentialClassifier(encoding, weights)

    # Take the log of the empirical fcount.
    log_empirical_fcount = numpy.log2(empirical_fcount)
    del empirical_fcount

    if trace > 0:
        print("  ==> Training (%d iterations)" % cutoffs["max_iter"])
    if trace > 2:
        print()
        print("      Iteration    Log Likelihood    Accuracy")
        print("      ---------------------------------------")

    # Train the classifier.
    try:
        while True:
            if trace > 2:
                ll = cutoffchecker.ll or log_likelihood(classifier, train_toks)
                acc = cutoffchecker.acc or accuracy(classifier, train_toks)
                iternum = cutoffchecker.iter
                print("     %9d    %14.5f    %9.3f" % (iternum, ll, acc))

            # Use the model to estimate the number of times each
            # feature should occur in the training data.
            estimated_fcount = calculate_estimated_fcount(
                classifier, train_toks, encoding
            )

            # Take the log of estimated fcount (avoid taking log(0).)
            for fid in unattested:
                estimated_fcount[fid] += 1
            log_estimated_fcount = numpy.log2(estimated_fcount)
            del estimated_fcount

            # Update the classifier weights
            weights = classifier.weights()
            weights += (log_empirical_fcount - log_estimated_fcount) * Cinv
            classifier.set_weights(weights)

            # Check the log-likelihood & accuracy cutoffs.
            if cutoffchecker.check(classifier, train_toks):
                break

    except KeyboardInterrupt:
        print("      Training stopped: keyboard interrupt")
    except:
        raise

    if trace > 2:
        ll = log_likelihood(classifier, train_toks)
        acc = accuracy(classifier, train_toks)
        print("         Final    %14.5f    %9.3f" % (ll, acc))

    # Return the classifier.
    return classifier

def calculate_empirical_fcount(train_toks, encoding):
    fcount = numpy.zeros(encoding.length(), "d")

    for tok, label in train_toks:
        for (index, val) in encoding.encode(tok, label):
            fcount[index] += val

    return fcount


def calculate_estimated_fcount(classifier, train_toks, encoding):
    fcount = numpy.zeros(encoding.length(), "d")

    for tok, label in train_toks:
        pdist = classifier.prob_classify(tok)
        for label in pdist.samples():
            prob = pdist.prob(label)
            for (fid, fval) in encoding.encode(tok, label):
                fcount[fid] += prob * fval

    return fcount
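
# These two counts drive the weight update in
# train_maxent_classifier_with_gis() above: for each feature i,
#
#     weights[i] += Cinv * (log2(empirical_fcount[i]) - log2(estimated_fcount[i]))
#
# so a weight grows where the model under-predicts its feature and shrinks
# where it over-predicts, until the estimated counts match the empirical ones.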

######################################################################
# { Classifier Trainer: Improved Iterative Scaling
######################################################################


def train_maxent_classifier_with_iis(
    train_toks, trace=3, encoding=None, labels=None, **cutoffs
):
    """
    Train a new ``ConditionalExponentialClassifier``, using the given
    training samples, using the Improved Iterative Scaling algorithm.
    This ``ConditionalExponentialClassifier`` will encode the model
    that maximizes entropy from all the models that are empirically
    consistent with ``train_toks``.

    :see: ``train_maxent_classifier()`` for parameter descriptions.
    """
    cutoffs.setdefault("max_iter", 100)
    cutoffchecker = CutoffChecker(cutoffs)

    # Construct an encoding from the training data.
    if encoding is None:
        encoding = BinaryMaxentFeatureEncoding.train(train_toks, labels=labels)

    # Count how many times each feature occurs in the training data.
    empirical_ffreq = calculate_empirical_fcount(train_toks, encoding) / len(train_toks)

    # Find the nf map, and related variables nfarray and nfident.
    # nf is the sum of the features for a given labeled text.
    # nfmap compresses this sparse set of values to a dense list.
    # nfarray performs the reverse operation.  nfident is
    # nfarray multiplied by an identity matrix.
    nfmap = calculate_nfmap(train_toks, encoding)
    nfarray = numpy.array(sorted(nfmap, key=nfmap.__getitem__), "d")
    nftranspose = numpy.reshape(nfarray, (len(nfarray), 1))

    # Check for any features that are not attested in train_toks.
    unattested = set(numpy.nonzero(empirical_ffreq == 0)[0])

    # Build the classifier.  Start with weight=0 for each attested
    # feature, and weight=-infinity for each unattested feature.
    weights = numpy.zeros(len(empirical_ffreq), "d")
    for fid in unattested:
        weights[fid] = numpy.NINF
    classifier = ConditionalExponentialClassifier(encoding, weights)

    if trace > 0:
        print("  ==> Training (%d iterations)" % cutoffs["max_iter"])
    if trace > 2:
        print()
        print("      Iteration    Log Likelihood    Accuracy")
        print("      ---------------------------------------")

    # Train the classifier.
    try:
        while True:
            if trace > 2:
                ll = cutoffchecker.ll or log_likelihood(classifier, train_toks)
                acc = cutoffchecker.acc or accuracy(classifier, train_toks)
                iternum = cutoffchecker.iter
                print("     %9d    %14.5f    %9.3f" % (iternum, ll, acc))

            # Calculate the deltas for this iteration, using Newton's method.
            deltas = calculate_deltas(
                train_toks,
                classifier,
                unattested,
                empirical_ffreq,
                nfmap,
                nfarray,
                nftranspose,
                encoding,
            )

            # Use the deltas to update our weights.
            weights = classifier.weights()
            weights += deltas
            classifier.set_weights(weights)

            # Check the log-likelihood & accuracy cutoffs.
            if cutoffchecker.check(classifier, train_toks):
                break

    except KeyboardInterrupt:
        print("      Training stopped: keyboard interrupt")
    except:
        raise

    if trace > 2:
        ll = log_likelihood(classifier, train_toks)
        acc = accuracy(classifier, train_toks)
        print("         Final    %14.5f    %9.3f" % (ll, acc))

    # Return the classifier.
    return classifier

def calculate_nfmap(train_toks, encoding):
    """
    Construct a map that can be used to compress ``nf`` (which is
    typically sparse).

    *nf(feature_vector)* is the sum of the feature values for
    *feature_vector*.

    This represents the number of features that are active for a
    given labeled text.  This method finds all values of *nf(t)*
    that are attested for at least one token in the given list of
    training tokens; and constructs a dictionary mapping these
    attested values to a continuous range *0...N*.  For example,
    if the only values of *nf()* that were attested were 3, 5, and
    7, then ``_nfmap`` might return the dictionary ``{3:0, 5:1, 7:2}``.

    :return: A map that can be used to compress ``nf`` to a dense
        vector.
    :rtype: dict(int -> int)
    """
    # Map from nf to indices.  This allows us to use smaller arrays.
    nfset = set()
    for tok, _ in train_toks:
        for label in encoding.labels():
            nfset.add(sum(val for (id, val) in encoding.encode(tok, label)))
    return dict((nf, i) for (i, nf) in enumerate(nfset))
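
# For example (values taken from the docstring above): if the only attested
# values of nf() were 3, 5 and 7, this would return {3: 0, 5: 1, 7: 2}, and
# the corresponding nfarray built by the IIS trainer would be
# numpy.array([3., 5., 7.]).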

def calculate_deltas(
    train_toks,
    classifier,
    unattested,
    ffreq_empirical,
    nfmap,
    nfarray,
    nftranspose,
    encoding,
):
    """
    Calculate the update values for the classifier weights for
    this iteration of IIS.  These update weights are the value of
    ``delta`` that solves the equation::

      ffreq_empirical[i]
             =
      SUM[fs,l] (classifier.prob_classify(fs).prob(l) *
                 feature_vector(fs,l)[i] *
                 exp(delta[i] * nf(feature_vector(fs,l))))

    Where:
      - *(fs,l)* is a (featureset, label) tuple from ``train_toks``
      - *feature_vector(fs,l)* = ``encoding.encode(fs,l)``
      - *nf(vector)* = ``sum([val for (id,val) in vector])``

    This method uses Newton's method to solve this equation for
    *delta[i]*.  In particular, it starts with a guess of
    ``delta[i]`` = 1; and iteratively updates ``delta`` with:

    | delta[i] -= (ffreq_empirical[i] - sum1[i])/(-sum2[i])

    until convergence, where *sum1* and *sum2* are defined as:

    |    sum1[i](delta) = SUM[fs,l] f[i](fs,l,delta)
    |    sum2[i](delta) = SUM[fs,l] (f[i](fs,l,delta).nf(feature_vector(fs,l)))
    |    f[i](fs,l,delta) = (classifier.prob_classify(fs).prob(l) .
    |                        feature_vector(fs,l)[i] .
    |                        exp(delta[i] . nf(feature_vector(fs,l))))

    Note that *sum1* and *sum2* depend on ``delta``; so they need
    to be re-computed each iteration.

    The variables ``nfmap``, ``nfarray``, and ``nftranspose`` are
    used to generate a dense encoding for *nf(ltext)*.  This
    allows ``_deltas`` to calculate *sum1* and *sum2* using
    matrices, which yields a significant performance improvement.

    :param train_toks: The set of training tokens.
    :type train_toks: list(tuple(dict, str))
    :param classifier: The current classifier.
    :type classifier: ClassifierI
    :param ffreq_empirical: An array containing the empirical
        frequency for each feature.  The *i*\ th element of this
        array is the empirical frequency for feature *i*.
    :type ffreq_empirical: sequence of float
    :param unattested: An array that is 1 for features that are
        not attested in the training data; and 0 for features that
        are attested.  In other words, ``unattested[i]==0`` iff
        ``ffreq_empirical[i]==0``.
    :type unattested: sequence of int
    :param nfmap: A map that can be used to compress ``nf`` to a dense
        vector.
    :type nfmap: dict(int -> int)
    :param nfarray: An array that can be used to uncompress ``nf``
        from a dense vector.
    :type nfarray: array(float)
    :param nftranspose: The transpose of ``nfarray``
    :type nftranspose: array(float)
    """
    # These parameters control when we decide that we've
    # converged.  It probably should be possible to set these
    # manually, via keyword arguments to train.
    NEWTON_CONVERGE = 1e-12
    MAX_NEWTON = 300

    deltas = numpy.ones(encoding.length(), "d")

    # Precompute the A matrix:
    # A[nf][id] = sum ( p(fs) * p(label|fs) * f(fs,label) )
    # over all label,fs s.t. num_features[label,fs]=nf
    A = numpy.zeros((len(nfmap), encoding.length()), "d")

    for tok, label in train_toks:
        dist = classifier.prob_classify(tok)

        for label in encoding.labels():
            # Generate the feature vector
            feature_vector = encoding.encode(tok, label)
            # Find the number of active features
            nf = sum(val for (id, val) in feature_vector)
            # Update the A matrix
            for (id, val) in feature_vector:
                A[nfmap[nf], id] += dist.prob(label) * val
    A /= len(train_toks)

    # Iteratively solve for delta.  Use the following variables:
    #   - nf_delta[x][y] = nfarray[x] * delta[y]
    #   - exp_nf_delta[x][y] = exp(nf[x] * delta[y])
    #   - nf_exp_nf_delta[x][y] = nf[x] * exp(nf[x] * delta[y])
    #   - sum1[i][nf] = sum p(fs)p(label|fs)f[i](label,fs) exp(delta[i]nf)
    #   - sum2[i][nf] = sum p(fs)p(label|fs)f[i](label,fs) nf exp(delta[i]nf)
    for rangenum in range(MAX_NEWTON):
        nf_delta = numpy.outer(nfarray, deltas)
        exp_nf_delta = 2 ** nf_delta
        nf_exp_nf_delta = nftranspose * exp_nf_delta
        sum1 = numpy.sum(exp_nf_delta * A, axis=0)
        sum2 = numpy.sum(nf_exp_nf_delta * A, axis=0)

        # Avoid division by zero.
        for fid in unattested:
            sum2[fid] += 1

        # Update the deltas.
        deltas -= (ffreq_empirical - sum1) / -sum2

        # We can stop once we converge.
        n_error = numpy.sum(abs((ffreq_empirical - sum1))) / numpy.sum(abs(deltas))
        if n_error < NEWTON_CONVERGE:
            return deltas

    return deltas
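
# In scalar form, the loop above performs Newton's method on
#     g(delta[i]) = sum1[i](delta) - ffreq_empirical[i] = 0,
# using g'(delta[i]) = sum2[i](delta), i.e. delta[i] -= g(delta[i]) / sum2[i],
# which is exactly the update "deltas -= (ffreq_empirical - sum1) / -sum2".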

######################################################################
# { Classifier Trainer: megam
######################################################################

# [xx] possible extension: add support for using implicit file format;
# this would need to put requirements on what encoding is used.  But
# we may need this for other maxent classifier trainers that require
# implicit formats anyway.


def train_maxent_classifier_with_megam(
    train_toks, trace=3, encoding=None, labels=None, gaussian_prior_sigma=0, **kwargs
):
    """
    Train a new ``ConditionalExponentialClassifier``, using the given
    training samples, using the external ``megam`` library.  This
    ``ConditionalExponentialClassifier`` will encode the model that
    maximizes entropy from all the models that are empirically
    consistent with ``train_toks``.

    :see: ``train_maxent_classifier()`` for parameter descriptions.
    :see: ``nltk.classify.megam``
    """
    explicit = True
    bernoulli = True
    if "explicit" in kwargs:
        explicit = kwargs["explicit"]
    if "bernoulli" in kwargs:
        bernoulli = kwargs["bernoulli"]

    # Construct an encoding from the training data.
    if encoding is None:
        # Count cutoff can also be controlled by megam with the -minfc
        # option.  Not sure where the best place for it is.
        count_cutoff = kwargs.get("count_cutoff", 0)
        encoding = BinaryMaxentFeatureEncoding.train(
            train_toks, count_cutoff, labels=labels, alwayson_features=True
        )
    elif labels is not None:
        raise ValueError("Specify encoding or labels, not both")

    # Write a training file for megam.
    try:
        fd, trainfile_name = tempfile.mkstemp(prefix="nltk-")
        with open(trainfile_name, "w") as trainfile:
            write_megam_file(
                train_toks, encoding, trainfile, explicit=explicit, bernoulli=bernoulli
            )
        os.close(fd)
    except (OSError, IOError, ValueError) as e:
        raise ValueError("Error while creating megam training file: %s" % e)

    # Run megam on the training file.
    options = []
    options += ["-nobias", "-repeat", "10"]
    if explicit:
        options += ["-explicit"]
    if not bernoulli:
        options += ["-fvals"]
    if gaussian_prior_sigma:
        # Lambda is just the precision of the Gaussian prior, i.e. it's the
        # inverse variance, so the parameter conversion is 1.0/sigma**2.
        # See http://www.umiacs.umd.edu/~hal/docs/daume04cg-bfgs.pdf.
        inv_variance = 1.0 / gaussian_prior_sigma ** 2
    else:
        inv_variance = 0
    options += ["-lambda", "%.2f" % inv_variance, "-tune"]
    if trace < 3:
        options += ["-quiet"]
    if "max_iter" in kwargs:
        options += ["-maxi", "%s" % kwargs["max_iter"]]
    if "ll_delta" in kwargs:
        # [xx] this is actually a perplexity delta, not a log
        # likelihood delta
        options += ["-dpp", "%s" % abs(kwargs["ll_delta"])]
    if hasattr(encoding, "cost"):
        options += ["-multilabel"]  # each possible la
    options += ["multiclass", trainfile_name]
    stdout = call_megam(options)
    # print('./megam_i686.opt ', ' '.join(options))

    # Delete the training file
    try:
        os.remove(trainfile_name)
    except (OSError, IOError) as e:
        print("Warning: unable to delete %s: %s" % (trainfile_name, e))

    # Parse the generated weight vector.
    weights = parse_megam_weights(stdout, encoding.length(), explicit)

    # Convert from base-e to base-2 weights.
    weights *= numpy.log2(numpy.e)

    # Build the classifier
    return MaxentClassifier(encoding, weights)
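
# A hedged usage sketch: training via megam requires the external megam
# binary to be installed and discoverable by NLTK (e.g. via
# nltk.classify.megam.config_megam()); the corpus is illustrative:
#
#     # nltk.classify.megam.config_megam("/path/to/megam")
#     # classifier = MaxentClassifier.train(train, algorithm="megam",
#     #                                     gaussian_prior_sigma=1.0)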

######################################################################
# { Classifier Trainer: tadm
######################################################################


class TadmMaxentClassifier(MaxentClassifier):
    @classmethod
    def train(cls, train_toks, **kwargs):
        algorithm = kwargs.get("algorithm", "tao_lmvm")
        trace = kwargs.get("trace", 3)
        encoding = kwargs.get("encoding", None)
        labels = kwargs.get("labels", None)
        sigma = kwargs.get("gaussian_prior_sigma", 0)
        count_cutoff = kwargs.get("count_cutoff", 0)
        max_iter = kwargs.get("max_iter")
        ll_delta = kwargs.get("min_lldelta")

        # Construct an encoding from the training data.
        if not encoding:
            encoding = TadmEventMaxentFeatureEncoding.train(
                train_toks, count_cutoff, labels=labels
            )

        trainfile_fd, trainfile_name = tempfile.mkstemp(
            prefix="nltk-tadm-events-", suffix=".gz"
        )
        weightfile_fd, weightfile_name = tempfile.mkstemp(prefix="nltk-tadm-weights-")

        trainfile = gzip_open_unicode(trainfile_name, "w")
        write_tadm_file(train_toks, encoding, trainfile)
        trainfile.close()

        options = []
        options.extend(["-monitor"])
        options.extend(["-method", algorithm])
        if sigma:
            options.extend(["-l2", "%.6f" % sigma ** 2])
        if max_iter:
            options.extend(["-max_it", "%d" % max_iter])
        if ll_delta:
            options.extend(["-fatol", "%.6f" % abs(ll_delta)])
        options.extend(["-events_in", trainfile_name])
        options.extend(["-params_out", weightfile_name])
        if trace < 3:
            options.extend(["2>&1"])
        else:
            options.extend(["-summary"])

        call_tadm(options)

        with open(weightfile_name, "r") as weightfile:
            weights = parse_tadm_weights(weightfile)

        os.remove(trainfile_name)
        os.remove(weightfile_name)

        # Convert from base-e to base-2 weights.
        weights *= numpy.log2(numpy.e)

        # Build the classifier
        return cls(encoding, weights)

######################################################################
# { Demo
######################################################################


def demo():
    from nltk.classify.util import names_demo

    classifier = names_demo(MaxentClassifier.train)


if __name__ == "__main__":
    demo()