association.py 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461
  1. # Natural Language Toolkit: Ngram Association Measures
  2. #
  3. # Copyright (C) 2001-2020 NLTK Project
  4. # Author: Joel Nothman <jnothman@student.usyd.edu.au>
  5. # URL: <http://nltk.org>
  6. # For license information, see LICENSE.TXT
  7. """
  8. Provides scoring functions for a number of association measures through a
  9. generic, abstract implementation in ``NgramAssocMeasures``, and n-specific
  10. ``BigramAssocMeasures`` and ``TrigramAssocMeasures``.
  11. """
  12. import math as _math
  13. from abc import ABCMeta, abstractmethod
  14. from functools import reduce
  15. _log2 = lambda x: _math.log(x, 2.0)
  16. _ln = _math.log
  17. _product = lambda s: reduce(lambda x, y: x * y, s)
  18. _SMALL = 1e-20
  19. try:
  20. from scipy.stats import fisher_exact
  21. except ImportError:
  22. def fisher_exact(*_args, **_kwargs):
  23. raise NotImplementedError
  24. ### Indices to marginals arguments:
  25. NGRAM = 0
  26. """Marginals index for the ngram count"""
  27. UNIGRAMS = -2
  28. """Marginals index for a tuple of each unigram count"""
  29. TOTAL = -1
  30. """Marginals index for the number of words in the data"""
  31. class NgramAssocMeasures(metaclass=ABCMeta):
  32. """
  33. An abstract class defining a collection of generic association measures.
  34. Each public method returns a score, taking the following arguments::
  35. score_fn(count_of_ngram,
  36. (count_of_n-1gram_1, ..., count_of_n-1gram_j),
  37. (count_of_n-2gram_1, ..., count_of_n-2gram_k),
  38. ...,
  39. (count_of_1gram_1, ..., count_of_1gram_n),
  40. count_of_total_words)
  41. See ``BigramAssocMeasures`` and ``TrigramAssocMeasures``
  42. Inheriting classes should define a property _n, and a method _contingency
  43. which calculates contingency values from marginals in order for all
  44. association measures defined here to be usable.
  45. """
  46. _n = 0
  47. @staticmethod
  48. @abstractmethod
  49. def _contingency(*marginals):
  50. """Calculates values of a contingency table from marginal values."""
  51. raise NotImplementedError(
  52. "The contingency table is not available" "in the general ngram case"
  53. )
  54. @staticmethod
  55. @abstractmethod
  56. def _marginals(*contingency):
  57. """Calculates values of contingency table marginals from its values."""
  58. raise NotImplementedError(
  59. "The contingency table is not available" "in the general ngram case"
  60. )
  61. @classmethod
  62. def _expected_values(cls, cont):
  63. """Calculates expected values for a contingency table."""
  64. n_all = sum(cont)
  65. bits = [1 << i for i in range(cls._n)]
  66. # For each contingency table cell
  67. for i in range(len(cont)):
  68. # Yield the expected value
  69. yield (
  70. _product(
  71. sum(cont[x] for x in range(2 ** cls._n) if (x & j) == (i & j))
  72. for j in bits
  73. )
  74. / (n_all ** (cls._n - 1))
  75. )
  76. @staticmethod
  77. def raw_freq(*marginals):
  78. """Scores ngrams by their frequency"""
  79. return marginals[NGRAM] / marginals[TOTAL]
  80. @classmethod
  81. def student_t(cls, *marginals):
  82. """Scores ngrams using Student's t test with independence hypothesis
  83. for unigrams, as in Manning and Schutze 5.3.1.
  84. """
  85. return (
  86. marginals[NGRAM]
  87. - _product(marginals[UNIGRAMS]) / (marginals[TOTAL] ** (cls._n - 1))
  88. ) / (marginals[NGRAM] + _SMALL) ** 0.5
  89. @classmethod
  90. def chi_sq(cls, *marginals):
  91. """Scores ngrams using Pearson's chi-square as in Manning and Schutze
  92. 5.3.3.
  93. """
  94. cont = cls._contingency(*marginals)
  95. exps = cls._expected_values(cont)
  96. return sum((obs - exp) ** 2 / (exp + _SMALL) for obs, exp in zip(cont, exps))
  97. @staticmethod
  98. def mi_like(*marginals, **kwargs):
  99. """Scores ngrams using a variant of mutual information. The keyword
  100. argument power sets an exponent (default 3) for the numerator. No
  101. logarithm of the result is calculated.
  102. """
  103. return marginals[NGRAM] ** kwargs.get("power", 3) / _product(
  104. marginals[UNIGRAMS]
  105. )
  106. @classmethod
  107. def pmi(cls, *marginals):
  108. """Scores ngrams by pointwise mutual information, as in Manning and
  109. Schutze 5.4.
  110. """
  111. return _log2(marginals[NGRAM] * marginals[TOTAL] ** (cls._n - 1)) - _log2(
  112. _product(marginals[UNIGRAMS])
  113. )
  114. @classmethod
  115. def likelihood_ratio(cls, *marginals):
  116. """Scores ngrams using likelihood ratios as in Manning and Schutze 5.3.4.
  117. """
  118. cont = cls._contingency(*marginals)
  119. return cls._n * sum(
  120. obs * _ln(obs / (exp + _SMALL) + _SMALL)
  121. for obs, exp in zip(cont, cls._expected_values(cont))
  122. )
  123. @classmethod
  124. def poisson_stirling(cls, *marginals):
  125. """Scores ngrams using the Poisson-Stirling measure."""
  126. exp = _product(marginals[UNIGRAMS]) / (marginals[TOTAL] ** (cls._n - 1))
  127. return marginals[NGRAM] * (_log2(marginals[NGRAM] / exp) - 1)
  128. @classmethod
  129. def jaccard(cls, *marginals):
  130. """Scores ngrams using the Jaccard index."""
  131. cont = cls._contingency(*marginals)
  132. return cont[0] / sum(cont[:-1])
  133. class BigramAssocMeasures(NgramAssocMeasures):
  134. """
  135. A collection of bigram association measures. Each association measure
  136. is provided as a function with three arguments::
  137. bigram_score_fn(n_ii, (n_ix, n_xi), n_xx)
  138. The arguments constitute the marginals of a contingency table, counting
  139. the occurrences of particular events in a corpus. The letter i in the
  140. suffix refers to the appearance of the word in question, while x indicates
  141. the appearance of any word. Thus, for example:
  142. n_ii counts (w1, w2), i.e. the bigram being scored
  143. n_ix counts (w1, *)
  144. n_xi counts (*, w2)
  145. n_xx counts (*, *), i.e. any bigram
  146. This may be shown with respect to a contingency table::
  147. w1 ~w1
  148. ------ ------
  149. w2 | n_ii | n_oi | = n_xi
  150. ------ ------
  151. ~w2 | n_io | n_oo |
  152. ------ ------
  153. = n_ix TOTAL = n_xx
  154. """
  155. _n = 2
  156. @staticmethod
  157. def _contingency(n_ii, n_ix_xi_tuple, n_xx):
  158. """Calculates values of a bigram contingency table from marginal values."""
  159. (n_ix, n_xi) = n_ix_xi_tuple
  160. n_oi = n_xi - n_ii
  161. n_io = n_ix - n_ii
  162. return (n_ii, n_oi, n_io, n_xx - n_ii - n_oi - n_io)
  163. @staticmethod
  164. def _marginals(n_ii, n_oi, n_io, n_oo):
  165. """Calculates values of contingency table marginals from its values."""
  166. return (n_ii, (n_oi + n_ii, n_io + n_ii), n_oo + n_oi + n_io + n_ii)
  167. @staticmethod
  168. def _expected_values(cont):
  169. """Calculates expected values for a contingency table."""
  170. n_xx = sum(cont)
  171. # For each contingency table cell
  172. for i in range(4):
  173. yield (cont[i] + cont[i ^ 1]) * (cont[i] + cont[i ^ 2]) / n_xx
  174. @classmethod
  175. def phi_sq(cls, *marginals):
  176. """Scores bigrams using phi-square, the square of the Pearson correlation
  177. coefficient.
  178. """
  179. n_ii, n_io, n_oi, n_oo = cls._contingency(*marginals)
  180. return (n_ii * n_oo - n_io * n_oi) ** 2 / (
  181. (n_ii + n_io) * (n_ii + n_oi) * (n_io + n_oo) * (n_oi + n_oo)
  182. )
  183. @classmethod
  184. def chi_sq(cls, n_ii, n_ix_xi_tuple, n_xx):
  185. """Scores bigrams using chi-square, i.e. phi-sq multiplied by the number
  186. of bigrams, as in Manning and Schutze 5.3.3.
  187. """
  188. (n_ix, n_xi) = n_ix_xi_tuple
  189. return n_xx * cls.phi_sq(n_ii, (n_ix, n_xi), n_xx)
  190. @classmethod
  191. def fisher(cls, *marginals):
  192. """Scores bigrams using Fisher's Exact Test (Pedersen 1996). Less
  193. sensitive to small counts than PMI or Chi Sq, but also more expensive
  194. to compute. Requires scipy.
  195. """
  196. n_ii, n_io, n_oi, n_oo = cls._contingency(*marginals)
  197. (odds, pvalue) = fisher_exact([[n_ii, n_io], [n_oi, n_oo]], alternative="less")
  198. return pvalue
  199. @staticmethod
  200. def dice(n_ii, n_ix_xi_tuple, n_xx):
  201. """Scores bigrams using Dice's coefficient."""
  202. (n_ix, n_xi) = n_ix_xi_tuple
  203. return 2 * n_ii / (n_ix + n_xi)
  204. class TrigramAssocMeasures(NgramAssocMeasures):
  205. """
  206. A collection of trigram association measures. Each association measure
  207. is provided as a function with four arguments::
  208. trigram_score_fn(n_iii,
  209. (n_iix, n_ixi, n_xii),
  210. (n_ixx, n_xix, n_xxi),
  211. n_xxx)
  212. The arguments constitute the marginals of a contingency table, counting
  213. the occurrences of particular events in a corpus. The letter i in the
  214. suffix refers to the appearance of the word in question, while x indicates
  215. the appearance of any word. Thus, for example:
  216. n_iii counts (w1, w2, w3), i.e. the trigram being scored
  217. n_ixx counts (w1, *, *)
  218. n_xxx counts (*, *, *), i.e. any trigram
  219. """
  220. _n = 3
  221. @staticmethod
  222. def _contingency(n_iii, n_iix_tuple, n_ixx_tuple, n_xxx):
  223. """Calculates values of a trigram contingency table (or cube) from
  224. marginal values.
  225. >>> TrigramAssocMeasures._contingency(1, (1, 1, 1), (1, 73, 1), 2000)
  226. (1, 0, 0, 0, 0, 72, 0, 1927)
  227. """
  228. (n_iix, n_ixi, n_xii) = n_iix_tuple
  229. (n_ixx, n_xix, n_xxi) = n_ixx_tuple
  230. n_oii = n_xii - n_iii
  231. n_ioi = n_ixi - n_iii
  232. n_iio = n_iix - n_iii
  233. n_ooi = n_xxi - n_iii - n_oii - n_ioi
  234. n_oio = n_xix - n_iii - n_oii - n_iio
  235. n_ioo = n_ixx - n_iii - n_ioi - n_iio
  236. n_ooo = n_xxx - n_iii - n_oii - n_ioi - n_iio - n_ooi - n_oio - n_ioo
  237. return (n_iii, n_oii, n_ioi, n_ooi, n_iio, n_oio, n_ioo, n_ooo)
  238. @staticmethod
  239. def _marginals(*contingency):
  240. """Calculates values of contingency table marginals from its values.
  241. >>> TrigramAssocMeasures._marginals(1, 0, 0, 0, 0, 72, 0, 1927)
  242. (1, (1, 1, 1), (1, 73, 1), 2000)
  243. """
  244. n_iii, n_oii, n_ioi, n_ooi, n_iio, n_oio, n_ioo, n_ooo = contingency
  245. return (
  246. n_iii,
  247. (n_iii + n_iio, n_iii + n_ioi, n_iii + n_oii),
  248. (
  249. n_iii + n_ioi + n_iio + n_ioo,
  250. n_iii + n_oii + n_iio + n_oio,
  251. n_iii + n_oii + n_ioi + n_ooi,
  252. ),
  253. sum(contingency),
  254. )
  255. class QuadgramAssocMeasures(NgramAssocMeasures):
  256. """
  257. A collection of quadgram association measures. Each association measure
  258. is provided as a function with five arguments::
  259. trigram_score_fn(n_iiii,
  260. (n_iiix, n_iixi, n_ixii, n_xiii),
  261. (n_iixx, n_ixix, n_ixxi, n_xixi, n_xxii, n_xiix),
  262. (n_ixxx, n_xixx, n_xxix, n_xxxi),
  263. n_all)
  264. The arguments constitute the marginals of a contingency table, counting
  265. the occurrences of particular events in a corpus. The letter i in the
  266. suffix refers to the appearance of the word in question, while x indicates
  267. the appearance of any word. Thus, for example:
  268. n_iiii counts (w1, w2, w3, w4), i.e. the quadgram being scored
  269. n_ixxi counts (w1, *, *, w4)
  270. n_xxxx counts (*, *, *, *), i.e. any quadgram
  271. """
  272. _n = 4
  273. @staticmethod
  274. def _contingency(n_iiii, n_iiix_tuple, n_iixx_tuple, n_ixxx_tuple, n_xxxx):
  275. """Calculates values of a quadgram contingency table from
  276. marginal values.
  277. """
  278. (n_iiix, n_iixi, n_ixii, n_xiii) = n_iiix_tuple
  279. (n_iixx, n_ixix, n_ixxi, n_xixi, n_xxii, n_xiix) = n_iixx_tuple
  280. (n_ixxx, n_xixx, n_xxix, n_xxxi) = n_ixxx_tuple
  281. n_oiii = n_xiii - n_iiii
  282. n_ioii = n_ixii - n_iiii
  283. n_iioi = n_iixi - n_iiii
  284. n_ooii = n_xxii - n_iiii - n_oiii - n_ioii
  285. n_oioi = n_xixi - n_iiii - n_oiii - n_iioi
  286. n_iooi = n_ixxi - n_iiii - n_ioii - n_iioi
  287. n_oooi = n_xxxi - n_iiii - n_oiii - n_ioii - n_iioi - n_ooii - n_iooi - n_oioi
  288. n_iiio = n_iiix - n_iiii
  289. n_oiio = n_xiix - n_iiii - n_oiii - n_iiio
  290. n_ioio = n_ixix - n_iiii - n_ioii - n_iiio
  291. n_ooio = n_xxix - n_iiii - n_oiii - n_ioii - n_iiio - n_ooii - n_ioio - n_oiio
  292. n_iioo = n_iixx - n_iiii - n_iioi - n_iiio
  293. n_oioo = n_xixx - n_iiii - n_oiii - n_iioi - n_iiio - n_oioi - n_oiio - n_iioo
  294. n_iooo = n_ixxx - n_iiii - n_ioii - n_iioi - n_iiio - n_iooi - n_iioo - n_ioio
  295. n_oooo = (
  296. n_xxxx
  297. - n_iiii
  298. - n_oiii
  299. - n_ioii
  300. - n_iioi
  301. - n_ooii
  302. - n_oioi
  303. - n_iooi
  304. - n_oooi
  305. - n_iiio
  306. - n_oiio
  307. - n_ioio
  308. - n_ooio
  309. - n_iioo
  310. - n_oioo
  311. - n_iooo
  312. )
  313. return (
  314. n_iiii,
  315. n_oiii,
  316. n_ioii,
  317. n_ooii,
  318. n_iioi,
  319. n_oioi,
  320. n_iooi,
  321. n_oooi,
  322. n_iiio,
  323. n_oiio,
  324. n_ioio,
  325. n_ooio,
  326. n_iioo,
  327. n_oioo,
  328. n_iooo,
  329. n_oooo,
  330. )
  331. @staticmethod
  332. def _marginals(*contingency):
  333. """Calculates values of contingency table marginals from its values.
  334. QuadgramAssocMeasures._marginals(1, 0, 2, 46, 552, 825, 2577, 34967, 1, 0, 2, 48, 7250, 9031, 28585, 356653)
  335. (1, (2, 553, 3, 1), (7804, 6, 3132, 1378, 49, 2), (38970, 17660, 100, 38970), 440540)
  336. """
  337. n_iiii, n_oiii, n_ioii, n_ooii, n_iioi, n_oioi, n_iooi, n_oooi, n_iiio, n_oiio, n_ioio, n_ooio, n_iioo, n_oioo, n_iooo, n_oooo = (
  338. contingency
  339. )
  340. n_iiix = n_iiii + n_iiio
  341. n_iixi = n_iiii + n_iioi
  342. n_ixii = n_iiii + n_ioii
  343. n_xiii = n_iiii + n_oiii
  344. n_iixx = n_iiii + n_iioi + n_iiio + n_iioo
  345. n_ixix = n_iiii + n_ioii + n_iiio + n_ioio
  346. n_ixxi = n_iiii + n_ioii + n_iioi + n_iooi
  347. n_xixi = n_iiii + n_oiii + n_iioi + n_oioi
  348. n_xxii = n_iiii + n_oiii + n_ioii + n_ooii
  349. n_xiix = n_iiii + n_oiii + n_iiio + n_oiio
  350. n_ixxx = n_iiii + n_ioii + n_iioi + n_iiio + n_iooi + n_iioo + n_ioio + n_iooo
  351. n_xixx = n_iiii + n_oiii + n_iioi + n_iiio + n_oioi + n_oiio + n_iioo + n_oioo
  352. n_xxix = n_iiii + n_oiii + n_ioii + n_iiio + n_ooii + n_ioio + n_oiio + n_ooio
  353. n_xxxi = n_iiii + n_oiii + n_ioii + n_iioi + n_ooii + n_iooi + n_oioi + n_oooi
  354. n_all = sum(contingency)
  355. return (
  356. n_iiii,
  357. (n_iiix, n_iixi, n_ixii, n_xiii),
  358. (n_iixx, n_ixix, n_ixxi, n_xixi, n_xxii, n_xiix),
  359. (n_ixxx, n_xixx, n_xxix, n_xxxi),
  360. n_all,
  361. )
  362. class ContingencyMeasures(object):
  363. """Wraps NgramAssocMeasures classes such that the arguments of association
  364. measures are contingency table values rather than marginals.
  365. """
  366. def __init__(self, measures):
  367. """Constructs a ContingencyMeasures given a NgramAssocMeasures class"""
  368. self.__class__.__name__ = "Contingency" + measures.__class__.__name__
  369. for k in dir(measures):
  370. if k.startswith("__"):
  371. continue
  372. v = getattr(measures, k)
  373. if not k.startswith("_"):
  374. v = self._make_contingency_fn(measures, v)
  375. setattr(self, k, v)
  376. @staticmethod
  377. def _make_contingency_fn(measures, old_fn):
  378. """From an association measure function, produces a new function which
  379. accepts contingency table values as its arguments.
  380. """
  381. def res(*contingency):
  382. return old_fn(*measures._marginals(*contingency))
  383. res.__doc__ = old_fn.__doc__
  384. res.__name__ = old_fn.__name__
  385. return res