# -*- coding: utf-8 -*-
# Natural Language Toolkit: IBM Model 2
#
# Copyright (C) 2001-2013 NLTK Project
# Authors: Chin Yee Lee, Hengfeng Li, Ruxin Hou, Calvin Tanujaya Lim
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT

"""
Lexical translation model that considers word order.

IBM Model 2 improves on Model 1 by accounting for word order.
An alignment probability is introduced, a(i | j,l,m), which predicts
a source word position, given its aligned target word's position.

The EM algorithm used in Model 2 is:

E step - In the training data, collect counts, weighted by prior
         probabilities.
         (a) count how many times a source language word is translated
             into a target language word
         (b) count how many times a particular position in the source
             sentence is aligned to a particular position in the target
             sentence

M step - Estimate new probabilities based on the counts from the E step

Notations:

i: Position in the source sentence
    Valid values are 0 (for NULL), 1, 2, ..., length of source sentence
j: Position in the target sentence
    Valid values are 1, 2, ..., length of target sentence
l: Number of words in the source sentence, excluding NULL
m: Number of words in the target sentence
s: A word in the source language
t: A word in the target language

References:

Philipp Koehn. 2010. Statistical Machine Translation.
Cambridge University Press, New York.

Peter E Brown, Stephen A. Della Pietra, Vincent J. Della Pietra, and
Robert L. Mercer. 1993. The Mathematics of Statistical Machine
Translation: Parameter Estimation. Computational Linguistics, 19 (2),
263-311.
"""

import warnings
from collections import defaultdict

from nltk.translate import AlignedSent
from nltk.translate import Alignment
from nltk.translate import IBMModel
from nltk.translate import IBMModel1
from nltk.translate.ibm_model import Counts


class IBMModel2(IBMModel):
    """
    Lexical translation model that considers word order

    >>> bitext = []
    >>> bitext.append(AlignedSent(['klein', 'ist', 'das', 'haus'], ['the', 'house', 'is', 'small']))
    >>> bitext.append(AlignedSent(['das', 'haus', 'ist', 'ja', 'groß'], ['the', 'house', 'is', 'big']))
    >>> bitext.append(AlignedSent(['das', 'buch', 'ist', 'ja', 'klein'], ['the', 'book', 'is', 'small']))
    >>> bitext.append(AlignedSent(['das', 'haus'], ['the', 'house']))
    >>> bitext.append(AlignedSent(['das', 'buch'], ['the', 'book']))
    >>> bitext.append(AlignedSent(['ein', 'buch'], ['a', 'book']))

    >>> ibm2 = IBMModel2(bitext, 5)

    >>> print(round(ibm2.translation_table['buch']['book'], 3))
    1.0
    >>> print(round(ibm2.translation_table['das']['book'], 3))
    0.0
    >>> print(round(ibm2.translation_table['buch'][None], 3))
    0.0
    >>> print(round(ibm2.translation_table['ja'][None], 3))
    0.0

    >>> print(ibm2.alignment_table[1][1][2][2])
    0.938...
    >>> print(round(ibm2.alignment_table[1][2][2][2], 3))
    0.0
    >>> print(round(ibm2.alignment_table[2][2][4][5], 3))
    1.0

    >>> test_sentence = bitext[2]
    >>> test_sentence.words
    ['das', 'buch', 'ist', 'ja', 'klein']
    >>> test_sentence.mots
    ['the', 'book', 'is', 'small']
    >>> test_sentence.alignment
    Alignment([(0, 0), (1, 1), (2, 2), (3, 2), (4, 3)])

    """

    def __init__(self, sentence_aligned_corpus, iterations, probability_tables=None):
        """
        Train on ``sentence_aligned_corpus`` and create a lexical
        translation model and an alignment model.

        Translation direction is from ``AlignedSent.mots`` to
        ``AlignedSent.words``.

        :param sentence_aligned_corpus: Sentence-aligned parallel corpus
        :type sentence_aligned_corpus: list(AlignedSent)

        :param iterations: Number of iterations to run training algorithm
        :type iterations: int

        :param probability_tables: Optional. Use this to pass in custom
            probability values. If not specified, probabilities will be
            set to a uniform distribution, or some other sensible value.
            If specified, all the following entries must be present:
            ``translation_table``, ``alignment_table``.
            See ``IBMModel`` for the type and purpose of these tables.
        :type probability_tables: dict[str]: object
        """
        super(IBMModel2, self).__init__(sentence_aligned_corpus)

        if probability_tables is None:
            # Get translation probabilities from IBM Model 1
            # Run more iterations of training for Model 1, since it is
            # faster than Model 2
            ibm1 = IBMModel1(sentence_aligned_corpus, 2 * iterations)
            self.translation_table = ibm1.translation_table
            self.set_uniform_probabilities(sentence_aligned_corpus)
        else:
            # Set user-defined probabilities
            self.translation_table = probability_tables["translation_table"]
            self.alignment_table = probability_tables["alignment_table"]

        for n in range(0, iterations):
            self.train(sentence_aligned_corpus)

        self.align_all(sentence_aligned_corpus)

    def set_uniform_probabilities(self, sentence_aligned_corpus):
        # a(i | j,l,m) = 1 / (l+1) for all i, j, l, m
        l_m_combinations = set()
        for aligned_sentence in sentence_aligned_corpus:
            l = len(aligned_sentence.mots)
            m = len(aligned_sentence.words)
            # Initialize each (l, m) combination only once
            if (l, m) not in l_m_combinations:
                l_m_combinations.add((l, m))
                initial_prob = 1 / (l + 1)
                if initial_prob < IBMModel.MIN_PROB:
                    warnings.warn(
                        "A source sentence is too long ("
                        + str(l)
                        + " words). Results may be less accurate."
                    )
                for i in range(0, l + 1):
                    for j in range(1, m + 1):
                        self.alignment_table[i][j][l][m] = initial_prob

    def train(self, parallel_corpus):
        counts = Model2Counts()
        for aligned_sentence in parallel_corpus:
            src_sentence = [None] + aligned_sentence.mots
            trg_sentence = ["UNUSED"] + aligned_sentence.words  # 1-indexed
            l = len(aligned_sentence.mots)
            m = len(aligned_sentence.words)

            # E step (a): Compute normalization factors to weigh counts
            total_count = self.prob_all_alignments(src_sentence, trg_sentence)
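            # total_count[t] is the E-step denominator: for each target
            # word t in this sentence, the sum of
            # translation(t | s_i) * alignment(i | j, l, m) over all source
            # positions i (and over repeated occurrences of t), so each
            # count collected below is a normalized posterior probability.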

            # E step (b): Collect counts
            for j in range(1, m + 1):
                t = trg_sentence[j]
                for i in range(0, l + 1):
                    s = src_sentence[i]
                    count = self.prob_alignment_point(i, j, src_sentence, trg_sentence)
                    normalized_count = count / total_count[t]
                    counts.update_lexical_translation(normalized_count, s, t)
                    counts.update_alignment(normalized_count, i, j, l, m)

        # M step: Update probabilities with maximum likelihood estimates
        self.maximize_lexical_translation_probabilities(counts)
        self.maximize_alignment_probabilities(counts)

    def maximize_alignment_probabilities(self, counts):
        MIN_PROB = IBMModel.MIN_PROB
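        # MLE update from the E-step counts:
        #     a(i | j, l, m) = c(i | j, l, m) / sum over i' of c(i' | j, l, m)
        # where the denominator is pre-accumulated in alignment_for_any_i.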
        for i, j_s in counts.alignment.items():
            for j, src_sentence_lengths in j_s.items():
                for l, trg_sentence_lengths in src_sentence_lengths.items():
                    for m in trg_sentence_lengths:
                        estimate = (
                            counts.alignment[i][j][l][m]
                            / counts.alignment_for_any_i[j][l][m]
                        )
                        self.alignment_table[i][j][l][m] = max(estimate, MIN_PROB)

    def prob_all_alignments(self, src_sentence, trg_sentence):
        """
        Computes the probability of all possible word alignments,
        expressed as a marginal distribution over target words t

        Each entry in the return value represents the contribution to
        the total alignment probability by the target word t.

        To obtain probability(alignment | src_sentence, trg_sentence),
        simply sum the entries in the return value.

        :return: Probability of t for all s in ``src_sentence``
        :rtype: dict(str): float
        """
        alignment_prob_for_t = defaultdict(lambda: 0.0)
        for j in range(1, len(trg_sentence)):
            t = trg_sentence[j]
            for i in range(0, len(src_sentence)):
                alignment_prob_for_t[t] += self.prob_alignment_point(
                    i, j, src_sentence, trg_sentence
                )
        return alignment_prob_for_t

    def prob_alignment_point(self, i, j, src_sentence, trg_sentence):
        """
        Probability that position j in ``trg_sentence`` is aligned to
        position i in the ``src_sentence``
        """
        l = len(src_sentence) - 1
        m = len(trg_sentence) - 1
        s = src_sentence[i]
        t = trg_sentence[j]
        return self.translation_table[t][s] * self.alignment_table[i][j][l][m]

    def prob_t_a_given_s(self, alignment_info):
        """
        Probability of target sentence and an alignment given the
        source sentence
        """
        prob = 1.0
        l = len(alignment_info.src_sentence) - 1
        m = len(alignment_info.trg_sentence) - 1

        for j, i in enumerate(alignment_info.alignment):
            if j == 0:
                continue  # skip the dummy zeroth element
            trg_word = alignment_info.trg_sentence[j]
            src_word = alignment_info.src_sentence[i]
            prob *= (
                self.translation_table[trg_word][src_word]
                * self.alignment_table[i][j][l][m]
            )

        return max(prob, IBMModel.MIN_PROB)

    def align_all(self, parallel_corpus):
        for sentence_pair in parallel_corpus:
            self.align(sentence_pair)

    def align(self, sentence_pair):
        """
        Determines the best word alignment for one sentence pair from
        the corpus that the model was trained on.

        The best alignment will be set in ``sentence_pair`` when the
        method returns. In contrast with the internal implementation of
        IBM models, the word indices in the ``Alignment`` are zero-
        indexed, not one-indexed.

        :param sentence_pair: A sentence in the source language and its
            counterpart sentence in the target language
        :type sentence_pair: AlignedSent
        """
        best_alignment = []

        l = len(sentence_pair.mots)
        m = len(sentence_pair.words)

        for j, trg_word in enumerate(sentence_pair.words):
            # Initialize trg_word to align with the NULL token
            best_prob = (
                self.translation_table[trg_word][None]
                * self.alignment_table[0][j + 1][l][m]
            )
            best_prob = max(best_prob, IBMModel.MIN_PROB)
            best_alignment_point = None
            for i, src_word in enumerate(sentence_pair.mots):
                align_prob = (
                    self.translation_table[trg_word][src_word]
                    * self.alignment_table[i + 1][j + 1][l][m]
                )
                if align_prob >= best_prob:
                    best_prob = align_prob
                    best_alignment_point = i

            best_alignment.append((j, best_alignment_point))

        sentence_pair.alignment = Alignment(best_alignment)


class Model2Counts(Counts):
    """
    Data object to store counts of various parameters during training.
    Includes counts for alignment.
    """

    def __init__(self):
        super(Model2Counts, self).__init__()
        self.alignment = defaultdict(
            lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: 0.0)))
        )
        self.alignment_for_any_i = defaultdict(
            lambda: defaultdict(lambda: defaultdict(lambda: 0.0))
        )

    def update_lexical_translation(self, count, s, t):
        self.t_given_s[t][s] += count
        self.any_t_given_s[s] += count

    def update_alignment(self, count, i, j, l, m):
        self.alignment[i][j][l][m] += count
        self.alignment_for_any_i[j][l][m] += count